/* This should eventually end up in machdep/, but for now, x86 is the
   only target, and it's easier this way... */

/*************************************************************************
* Some basic information about the the target CPU                       *
*************************************************************************/

#define EAX_INDEX 0
#define ECX_INDEX 1
#define EDX_INDEX 2
#define EBX_INDEX 3
#define ESP_INDEX 4
#define EBP_INDEX 5
#define ESI_INDEX 6
#define EDI_INDEX 7
#if defined (__x86_64__)
    #define R8_INDEX 8
    #define R9_INDEX 9
    #define R10_INDEX 10
    #define R11_INDEX 11
    #define R12_INDEX 12
    #define R13_INDEX 13
    #define R14_INDEX 14
    #define R15_INDEX 15
#endif
/* XXX this has to match X86_Reg8H_Base + 4 */
#define AH_INDEX (0x10 + 4 + EAX_INDEX)
#define CH_INDEX (0x10 + 4 + ECX_INDEX)
#define DH_INDEX (0x10 + 4 + EDX_INDEX)
#define BH_INDEX (0x10 + 4 + EBX_INDEX)

/* The register in which subroutines return an integer return value */
#define REG_RESULT EAX_INDEX

/* The registers subroutines take their first and second argument in */
#if defined (_WIN32)
/* Handle the _fastcall parameters of ECX and EDX */
    #define REG_PAR1 ECX_INDEX
    #define REG_PAR2 EDX_INDEX
#elif defined (__x86_64__)
    #define REG_PAR1 EDI_INDEX
    #define REG_PAR2 ESI_INDEX
#else
    #define REG_PAR1 EAX_INDEX
    #define REG_PAR2 EDX_INDEX
#endif

#if defined (_WIN32)
    #define REG_PC_PRE EAX_INDEX /* The register we use for preloading regs.pc_p */
    #define REG_PC_TMP ECX_INDEX
    #define SHIFTCOUNT_NREG ECX_INDEX /* Register that can be used for shiftcount. -1 if any reg will do */
#else
    #define REG_PC_PRE EAX_INDEX /* The register we use for preloading regs.pc_p */
    #define REG_PC_TMP ECX_INDEX /* Another register that is not the above */
    #define SHIFTCOUNT_NREG ECX_INDEX /* Register that can be used for shiftcount. -1 if any reg will do */
#endif

#define MUL_NREG1 EAX_INDEX /* %eax will hold the low 32 bits after a 32x32 mul */
#define MUL_NREG2 EDX_INDEX /* %edx will hold the high 32 bits */

#define STACK_ALIGN 16
#define STACK_OFFSET sizeof(void*)

byte always_used[] = { 4, 0xff };
#if defined (__x86_64__)
sbyte can_byte[] = { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 };
sbyte can_word[] = { 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1 };
#else
byte can_byte[] = { 0, 1, 2, 3, 0xff };
byte can_word[] = { 0, 1, 2, 3, 5, 6, 7, 0xff };
#endif

byte call_saved[] = { 0, 0, 0, 0, 1, 0, 0, 0 };

/* This *should* be the same as call_saved. But:
   - We might not really know which registers are saved, and which aren't,
     so we need to preserve some, but don't want to rely on everyone else
     also saving those registers
   - Special registers (such like the stack pointer) should not be "preserved"
     by pushing, even though they are "saved" across function calls
 */
byte need_to_preserve[] = { 1, 1, 1, 1, 0, 1, 1, 1 };

/* Whether classes of instructions do or don't clobber the native flags */
#define CLOBBER_MOV
#define CLOBBER_LEA
#define CLOBBER_CMOV
#define CLOBBER_POP
#define CLOBBER_PUSH
#define CLOBBER_SUB clobber_flags()
#define CLOBBER_SBB clobber_flags()
#define CLOBBER_CMP clobber_flags()
#define CLOBBER_ADD clobber_flags()
#define CLOBBER_ADC clobber_flags()
#define CLOBBER_AND clobber_flags()
#define CLOBBER_OR clobber_flags()
#define CLOBBER_XOR clobber_flags()

#define CLOBBER_ROL clobber_flags()
#define CLOBBER_ROR clobber_flags()
#define CLOBBER_SHLL clobber_flags()
#define CLOBBER_SHRL clobber_flags()
#define CLOBBER_SHRA clobber_flags()
#define CLOBBER_TEST clobber_flags()
#define CLOBBER_CL16
#define CLOBBER_CL8
#define CLOBBER_SE16
#define CLOBBER_SE8
#define CLOBBER_ZE16
#define CLOBBER_ZE8
#define CLOBBER_SW16 clobber_flags()
#define CLOBBER_SW32
#define CLOBBER_SETCC
#define CLOBBER_MUL clobber_flags()
#define CLOBBER_BT clobber_flags()
#define CLOBBER_BSF clobber_flags()

/*************************************************************************
* Actual encoding of the instructions on the target CPU                 *
*************************************************************************/

// #include "compemu_optimizer_x86.c"

static __forceinline ushort swap16(ushort x)
{
    return ((x & 0xff00) >> 8) | ((x & 0x00ff) << 8);
}

static __forceinline uint swap32(uint x)
{
    return ((x & 0xff00) << 8) | ((x & 0x00ff) << 24) | ((x & 0xff0000) >> 8) | ((x & 0xff000000) >> 24);
}

static __forceinline int isbyte(int x)
{
    return (x >= -128 && x <= 127);
}

LOWFUNC(NONE, WRITE, 1, raw_push_l_r, (R4 r))
{
    emit_byte(0x50 + r);
}
LENDFUNC(NONE, WRITE, 1, raw_push_l_r, (R4 r))

LOWFUNC(NONE, READ, 1, raw_pop_l_r, (R4 r))
{
    emit_byte(0x58 + r);
}
LENDFUNC(NONE, READ, 1, raw_pop_l_r, (R4 r))

LOWFUNC(WRITE, NONE, 2, raw_bt_l_ri, (R4 r, IMM i))
{
    emit_byte(0x0f);
    emit_byte(0xba);
    emit_byte(0xe0 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_bt_l_ri, (R4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_bt_l_rr, (R4 r, R4 b))
{
    emit_byte(0x0f);
    emit_byte(0xa3);
    emit_byte(0xc0 + 8 * b + r);
}
LENDFUNC(WRITE, NONE, 2, raw_bt_l_rr, (R4 r, R4 b))

LOWFUNC(WRITE, NONE, 2, raw_btc_l_ri, (RW4 r, IMM i))
{
    emit_byte(0x0f);
    emit_byte(0xba);
    emit_byte(0xf8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_btc_l_ri, (RW4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_btc_l_rr, (RW4 r, R4 b))
{
    emit_byte(0x0f);
    emit_byte(0xbb);
    emit_byte(0xc0 + 8 * b + r);
}
LENDFUNC(WRITE, NONE, 2, raw_btc_l_rr, (RW4 r, R4 b))

LOWFUNC(WRITE, NONE, 2, raw_btr_l_ri, (RW4 r, IMM i))
{
    emit_byte(0x0f);
    emit_byte(0xba);
    emit_byte(0xf0 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_btr_l_ri, (RW4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_btr_l_rr, (RW4 r, R4 b))
{
    emit_byte(0x0f);
    emit_byte(0xb3);
    emit_byte(0xc0 + 8 * b + r);
}
LENDFUNC(WRITE, NONE, 2, raw_btr_l_rr, (RW4 r, R4 b))

LOWFUNC(WRITE, NONE, 2, raw_bts_l_ri, (RW4 r, IMM i))
{
    emit_byte(0x0f);
    emit_byte(0xba);
    emit_byte(0xe8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_bts_l_ri, (RW4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_bts_l_rr, (RW4 r, R4 b))
{
    emit_byte(0x0f);
    emit_byte(0xab);
    emit_byte(0xc0 + 8 * b + r);
}
LENDFUNC(WRITE, NONE, 2, raw_bts_l_rr, (RW4 r, R4 b))

LOWFUNC(WRITE, NONE, 2, raw_sub_w_ri, (RW2 d, IMM i))
{
    emit_byte(0x66);
    if (isbyte(i))
    {
        emit_byte(0x83);
        emit_byte(0xe8 + d);
        emit_byte(i);
    }
    else
    {
        emit_byte(0x81);
        emit_byte(0xe8 + d);
        emit_word(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_sub_w_ri, (RW2 d, IMM i))

LOWFUNC(NONE, WRITE, 2, raw_mov_l_mi, (MEMW d, IMM s))
{
    emit_byte(0xc7);
    emit_byte(0x05);
    emit_long(d);
    emit_long(s);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_l_mi, (MEMW d, IMM s))

LOWFUNC(NONE, WRITE, 2, raw_mov_w_mi, (MEMW d, IMM s))
{
    emit_byte(0x66);
    emit_byte(0xc7);
    emit_byte(0x05);
    emit_long(d);
    emit_word(s);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_w_mi, (MEMW d, IMM s))

LOWFUNC(NONE, WRITE, 2, raw_mov_b_mi, (MEMW d, IMM s))
{
    emit_byte(0xc6);
    emit_byte(0x05);
    emit_long(d);
    emit_byte(s);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_b_mi, (MEMW d, IMM s))

LOWFUNC(WRITE, RMW, 2, raw_rol_b_mi, (MEMRW d, IMM i))
{
    emit_byte(0xc0);
    emit_byte(0x05);
    emit_long(d);
    emit_byte(i);
}
LENDFUNC(WRITE, RMW, 2, raw_rol_b_mi, (MEMRW d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_rol_b_ri, (RW1 r, IMM i))
{
    emit_byte(0xc0);
    emit_byte(0xc0 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_b_ri, (RW1 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_rol_w_ri, (RW2 r, IMM i))
{
    emit_byte(0x66);
    emit_byte(0xc1);
    emit_byte(0xc0 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_w_ri, (RW2 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_rol_l_ri, (RW4 r, IMM i))
{
    emit_byte(0xc1);
    emit_byte(0xc0 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_l_ri, (RW4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_rol_l_rr, (RW4 d, R1 r))
{
    emit_byte(0xd3);
    emit_byte(0xc0 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_l_rr, (RW4 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_rol_w_rr, (RW2 d, R1 r))
{
    emit_byte(0x66);
    emit_byte(0xd3);
    emit_byte(0xc0 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_w_rr, (RW2 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_rol_b_rr, (RW1 d, R1 r))
{
    emit_byte(0xd2);
    emit_byte(0xc0 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_rol_b_rr, (RW1 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shll_l_rr, (RW4 d, R1 r))
{
    emit_byte(0xd3);
    emit_byte(0xe0 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_l_rr, (RW4 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shll_w_rr, (RW2 d, R1 r))
{
    emit_byte(0x66);
    emit_byte(0xd3);
    emit_byte(0xe0 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_w_rr, (RW2 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shll_b_rr, (RW1 d, R1 r))
{
    emit_byte(0xd2);
    emit_byte(0xe0 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_b_rr, (RW1 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_ror_b_ri, (RW1 r, IMM i))
{
    emit_byte(0xc0);
    emit_byte(0xc8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_b_ri, (RW1 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_ror_w_ri, (RW2 r, IMM i))
{
    emit_byte(0x66);
    emit_byte(0xc1);
    emit_byte(0xc8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_w_ri, (RW2 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_ror_l_ri, (RW4 r, IMM i))
{
    emit_byte(0xc1);
    emit_byte(0xc8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_l_ri, (RW4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_ror_l_rr, (RW4 d, R1 r))
{
    emit_byte(0xd3);
    emit_byte(0xc8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_l_rr, (RW4 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_ror_w_rr, (RW2 d, R1 r))
{
    emit_byte(0x66);
    emit_byte(0xd3);
    emit_byte(0xc8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_w_rr, (RW2 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_ror_b_rr, (RW1 d, R1 r))
{
    emit_byte(0xd2);
    emit_byte(0xc8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_ror_b_rr, (RW1 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shrl_l_rr, (RW4 d, R1 r))
{
    emit_byte(0xd3);
    emit_byte(0xe8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_l_rr, (RW4 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shrl_w_rr, (RW2 d, R1 r))
{
    emit_byte(0x66);
    emit_byte(0xd3);
    emit_byte(0xe8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_w_rr, (RW2 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shrl_b_rr, (RW1 d, R1 r))
{
    emit_byte(0xd2);
    emit_byte(0xe8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_b_rr, (RW1 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shra_l_rr, (RW4 d, R1 r))
{
    emit_byte(0xd3);
    emit_byte(0xf8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_l_rr, (RW4 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shra_w_rr, (RW2 d, R1 r))
{
    emit_byte(0x66);
    emit_byte(0xd3);
    emit_byte(0xf8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_w_rr, (RW2 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shra_b_rr, (RW1 d, R1 r))
{
    emit_byte(0xd2);
    emit_byte(0xf8 + d);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_b_rr, (RW1 d, R1 r))

LOWFUNC(WRITE, NONE, 2, raw_shll_l_ri, (RW4 r, IMM i))
{
    emit_byte(0xc1);
    emit_byte(0xe0 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_l_ri, (RW4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_shll_w_ri, (RW2 r, IMM i))
{
    emit_byte(0x66);
    emit_byte(0xc1);
    emit_byte(0xe0 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_w_ri, (RW2 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_shll_b_ri, (RW1 r, IMM i))
{
    emit_byte(0xc0);
    emit_byte(0xe0 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shll_b_ri, (RW1 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_shrl_l_ri, (RW4 r, IMM i))
{
    emit_byte(0xc1);
    emit_byte(0xe8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_l_ri, (RW4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_shrl_w_ri, (RW2 r, IMM i))
{
    emit_byte(0x66);
    emit_byte(0xc1);
    emit_byte(0xe8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_w_ri, (RW2 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_shrl_b_ri, (RW1 r, IMM i))
{
    emit_byte(0xc0);
    emit_byte(0xe8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shrl_b_ri, (RW1 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_shra_l_ri, (RW4 r, IMM i))
{
    emit_byte(0xc1);
    emit_byte(0xf8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_l_ri, (RW4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_shra_w_ri, (RW2 r, IMM i))
{
    emit_byte(0x66);
    emit_byte(0xc1);
    emit_byte(0xf8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_w_ri, (RW2 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_shra_b_ri, (RW1 r, IMM i))
{
    emit_byte(0xc0);
    emit_byte(0xf8 + r);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_shra_b_ri, (RW1 r, IMM i))

LOWFUNC(WRITE, NONE, 1, raw_sahf, (R2 dummy_ah))
{
    emit_byte(0x9e);
}
LENDFUNC(WRITE, NONE, 1, raw_sahf, (R2 dummy_ah))

LOWFUNC(NONE, NONE, 1, raw_cpuid, (R4 dummy_eax))
{
    emit_byte(0x0f);
    emit_byte(0xa2);
}
LENDFUNC(NONE, NONE, 1, raw_cpuid, (R4 dummy_eax))

LOWFUNC(READ, NONE, 1, raw_lahf, (W2 dummy_ah))
{
    emit_byte(0x9f);
}
LENDFUNC(READ, NONE, 1, raw_lahf, (W2 dummy_ah))

LOWFUNC(READ, NONE, 2, raw_setcc, (W1 d, IMM cc))
{
    emit_byte(0x0f);
    emit_byte(0x90 + cc);
    emit_byte(0xc0 + d);
}
LENDFUNC(READ, NONE, 2, raw_setcc, (W1 d, IMM cc))

LOWFUNC(READ, WRITE, 2, raw_setcc_m, (MEMW d, IMM cc))
{
    emit_byte(0x0f);
    emit_byte(0x90 + cc);
    emit_byte(0x05);
    emit_long(d);
}
LENDFUNC(READ, WRITE, 2, raw_setcc_m, (MEMW d, IMM cc))

LOWFUNC(READ, NONE, 3, raw_cmov_b_rr, (RW1 d, R1 s, IMM cc))
{
    /* replacement using branch and mov */
    int uncc = (cc ^ 1);
    emit_byte(0x70 + uncc);
    emit_byte(3);  /* skip next 2 bytes if not cc=true */
    emit_byte(0x88);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(READ, NONE, 3, raw_cmov_b_rr, (RW1 d, R1 s, IMM cc))

LOWFUNC(READ, NONE, 3, raw_cmov_w_rr, (RW2 d, R2 s, IMM cc))
{
    if (have_cmov)
    {
        emit_byte(0x66);
        emit_byte(0x0f);
        emit_byte(0x40 + cc);
        emit_byte(0xc0 + 8 * d + s);
    }
    else   /* replacement using branch and mov */
    {
        int uncc = (cc ^ 1);
        emit_byte(0x70 + uncc);
        emit_byte(3); /* skip next 3 bytes if not cc=true */
        emit_byte(0x66);
        emit_byte(0x89);
        emit_byte(0xc0 + 8 * s + d);
    }
}
LENDFUNC(READ, NONE, 3, raw_cmov_w_rr, (RW2 d, R2 s, IMM cc))

LOWFUNC(READ, NONE, 3, raw_cmov_l_rr, (RW4 d, R4 s, IMM cc))
{
    if (have_cmov)
    {
        emit_byte(0x0f);
        emit_byte(0x40 + cc);
        emit_byte(0xc0 + 8 * d + s);
    }
    else   /* replacement using branch and mov */
    {
        int uncc = (cc ^ 1);
        emit_byte(0x70 + uncc);
        emit_byte(2); /* skip next 2 bytes if not cc=true */
        emit_byte(0x89);
        emit_byte(0xc0 + 8 * s + d);
    }
}
LENDFUNC(READ, NONE, 3, raw_cmov_l_rr, (RW4 d, R4 s, IMM cc))

LOWFUNC(WRITE, NONE, 2, raw_bsf_l_rr, (W4 d, R4 s))
{
    emit_byte(0x0f);
    emit_byte(0xbc);
    emit_byte(0xc0 + 8 * d + s);
}
LENDFUNC(WRITE, NONE, 2, raw_bsf_l_rr, (W4 d, R4 s))

LOWFUNC(NONE, NONE, 2, raw_sign_extend_16_rr, (W4 d, R2 s))
{
    emit_byte(0x0f);
    emit_byte(0xbf);
    emit_byte(0xc0 + 8 * d + s);
}
LENDFUNC(NONE, NONE, 2, raw_sign_extend_16_rr, (W4 d, R2 s))

LOWFUNC(NONE, NONE, 2, raw_sign_extend_8_rr, (W4 d, R1 s))
{
    emit_byte(0x0f);
    emit_byte(0xbe);
    emit_byte(0xc0 + 8 * d + s);
}
LENDFUNC(NONE, NONE, 2, raw_sign_extend_8_rr, (W4 d, R1 s))

LOWFUNC(NONE, NONE, 2, raw_zero_extend_16_rr, (W4 d, R2 s))
{
    emit_byte(0x0f);
    emit_byte(0xb7);
    emit_byte(0xc0 + 8 * d + s);
}
LENDFUNC(NONE, NONE, 2, raw_zero_extend_16_rr, (W4 d, R2 s))

LOWFUNC(NONE, NONE, 2, raw_zero_extend_8_rr, (W4 d, R1 s))
{
    emit_byte(0x0f);
    emit_byte(0xb6);
    emit_byte(0xc0 + 8 * d + s);
}
LENDFUNC(NONE, NONE, 2, raw_zero_extend_8_rr, (W4 d, R1 s))

LOWFUNC(NONE, NONE, 2, raw_imul_32_32, (RW4 d, R4 s))
{
    emit_byte(0x0f);
    emit_byte(0xaf);
    emit_byte(0xc0 + 8 * d + s);
}
LENDFUNC(NONE, NONE, 2, raw_imul_32_32, (RW4 d, R4 s))

LOWFUNC(NONE, NONE, 2, raw_imul_64_32, (RW4 d, RW4 s))
{
    d;

    #ifdef JIT_DEBUG
    if (d != MUL_NREG1 || s != MUL_NREG2)
    {
        Logger::Write(L"JIT: Bad register in IMUL: d=%d, s=%d\n", d, s);
        abort();
    }
    #endif
    emit_byte(0xf7);
    emit_byte(0xea);
}
LENDFUNC(NONE, NONE, 2, raw_imul_64_32, (RW4 d, RW4 s))

LOWFUNC(NONE, NONE, 2, raw_mul_64_32, (RW4 d, RW4 s))
{
    d;

    #ifdef JIT_DEBUG
    if (d != MUL_NREG1 || s != MUL_NREG2)
    {
        Logger::Write(L"JIT: Bad register in MUL: d=%d, s=%d\n", d, s);
        abort();
    }
    #endif
    emit_byte(0xf7);
    emit_byte(0xe2);
}
LENDFUNC(NONE, NONE, 2, raw_mul_64_32, (RW4 d, RW4 s))

LOWFUNC(NONE, NONE, 2, raw_mov_b_rr, (W1 d, R1 s))
{
    emit_byte(0x88);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_b_rr, (W1 d, R1 s))

LOWFUNC(NONE, NONE, 2, raw_mov_w_rr, (W2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x89);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_w_rr, (W2 d, R2 s))

LOWFUNC(NONE, READ, 3, raw_mov_l_rrm_indexed, (W4 d, R4 baser, R4 index))
{
    emit_byte(0x8b);
    if (baser == 5)
    {
        emit_byte(0x44 + 8 * d);
        emit_byte(8 * index + baser);
        emit_byte(0);
        return;
    }
    emit_byte(0x04 + 8 * d);
    emit_byte(8 * index + baser);
}
LENDFUNC(NONE, READ, 3, raw_mov_l_rrm_indexed, (W4 d, R4 baser, R4 index))

LOWFUNC(NONE, READ, 3, raw_mov_w_rrm_indexed, (W2 d, R4 baser, R4 index))
{
    emit_byte(0x66);
    emit_byte(0x8b);
    if (baser == 5)
    {
        emit_byte(0x44 + 8 * d);
        emit_byte(8 * index + baser);
        emit_byte(0);
        return;
    }
    emit_byte(0x04 + 8 * d);
    emit_byte(8 * index + baser);
}
LENDFUNC(NONE, READ, 3, raw_mov_w_rrm_indexed, (W2 d, R4 baser, R4 index))

LOWFUNC(NONE, READ, 3, raw_mov_b_rrm_indexed, (W1 d, R4 baser, R4 index))
{
    emit_byte(0x8a);
    if (baser == 5)
    {
        emit_byte(0x44 + 8 * d);
        emit_byte(8 * index + baser);
        emit_byte(0);
        return;
    }
    emit_byte(0x04 + 8 * d);
    emit_byte(8 * index + baser);
}
LENDFUNC(NONE, READ, 3, raw_mov_b_rrm_indexed, (W1 d, R4 baser, R4 index))

LOWFUNC(NONE, WRITE, 3, raw_mov_l_mrr_indexed, (R4 baser, R4 index, R4 s))
{
    emit_byte(0x89);
    if (baser == 5)
    {
        emit_byte(0x44 + 8 * s);
        emit_byte(8 * index + baser);
        emit_byte(0);
        return;
    }
    emit_byte(0x04 + 8 * s);
    emit_byte(8 * index + baser);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_mrr_indexed, (R4 baser, R4 index, R4 s))

LOWFUNC(NONE, WRITE, 3, raw_mov_w_mrr_indexed, (R4 baser, R4 index, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x89);
    if (baser == 5)
    {
        emit_byte(0x44 + 8 * s);
        emit_byte(8 * index + baser);
        emit_byte(0);
        return;
    }
    emit_byte(0x04 + 8 * s);
    emit_byte(8 * index + baser);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_mrr_indexed, (R4 baser, R4 index, R2 s))

LOWFUNC(NONE, WRITE, 3, raw_mov_b_mrr_indexed, (R4 baser, R4 index, R1 s))
{
    emit_byte(0x88);
    if (baser == 5)
    {
        emit_byte(0x44 + 8 * s);
        emit_byte(8 * index + baser);
        emit_byte(0);
        return;
    }
    emit_byte(0x04 + 8 * s);
    emit_byte(8 * index + baser);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_mrr_indexed, (R4 baser, R4 index, R1 s))

LOWFUNC(NONE, READ, 3, raw_mov_l_rm_indexed, (W4 d, IMM base, R4 index))
{
    emit_byte(0x8b);
    emit_byte(0x04 + 8 * d);
    emit_byte(0x85 + 8 * index);
    emit_long(base);
}
LENDFUNC(NONE, READ, 3, raw_mov_l_rm_indexed, (W4 d, IMM base, R4 index))

LOWFUNC(NONE, READ, 4, raw_cmov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM cond))
{
    if (have_cmov)
    {
        emit_byte(0x0f);
        emit_byte(0x40 + cond);
    }
    else   /* replacement using branch and mov */
    {
        int uncc = (cond ^ 1);
        emit_byte(0x70 + uncc);
        emit_byte(7); /* skip next 7 bytes if not cc=true */
        emit_byte(0x8b);
    }
    emit_byte(0x04 + 8 * d);
    emit_byte(0x85 + 8 * index);
    emit_long(base);
}
LENDFUNC(NONE, READ, 4, raw_cmov_l_rm_indexed, (W4 d, IMM base, R4 index, IMM cond))

LOWFUNC(NONE, READ, 3, raw_cmov_l_rm, (W4 d, IMM mem, IMM cond))
{
    if (have_cmov)
    {
        emit_byte(0x0f);
        emit_byte(0x40 + cond);
        emit_byte(0x05 + 8 * d);
        emit_long(mem);
    }
    else   /* replacement using branch and mov */
    {
        int uncc = (cond ^ 1);
        emit_byte(0x70 + uncc);
        emit_byte(6); /* skip next 6 bytes if not cc=true */
        emit_byte(0x8b);
        emit_byte(0x05 + 8 * d);
        emit_long(mem);
    }
}
LENDFUNC(NONE, READ, 3, raw_cmov_l_rm, (W4 d, IMM mem, IMM cond))

LOWFUNC(NONE, READ, 3, raw_mov_l_rR, (W4 d, R4 s, IMM offset))
{
    emit_byte(0x8b);
    emit_byte(0x40 + 8 * d + s);
    emit_byte(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_l_rR, (W4 d, R4 s, IMM offset))

LOWFUNC(NONE, READ, 3, raw_mov_w_rR, (W2 d, R4 s, IMM offset))
{
    emit_byte(0x66);
    emit_byte(0x8b);
    emit_byte(0x40 + 8 * d + s);
    emit_byte(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_w_rR, (W2 d, R4 s, IMM offset))

LOWFUNC(NONE, READ, 3, raw_mov_b_rR, (W1 d, R4 s, IMM offset))
{
    emit_byte(0x8a);
    emit_byte(0x40 + 8 * d + s);
    emit_byte(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_b_rR, (W1 d, R4 s, IMM offset))

LOWFUNC(NONE, READ, 3, raw_mov_l_brR, (W4 d, R4 s, IMM offset))
{
    emit_byte(0x8b);
    emit_byte(0x80 + 8 * d + s);
    emit_long(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_l_brR, (W4 d, R4 s, IMM offset))

LOWFUNC(NONE, READ, 3, raw_mov_w_brR, (W2 d, R4 s, IMM offset))
{
    emit_byte(0x66);
    emit_byte(0x8b);
    emit_byte(0x80 + 8 * d + s);
    emit_long(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_w_brR, (W2 d, R4 s, IMM offset))

LOWFUNC(NONE, READ, 3, raw_mov_b_brR, (W1 d, R4 s, IMM offset))
{
    emit_byte(0x8a);
    emit_byte(0x80 + 8 * d + s);
    emit_long(offset);
}
LENDFUNC(NONE, READ, 3, raw_mov_b_brR, (W1 d, R4 s, IMM offset))

LOWFUNC(NONE, WRITE, 3, raw_mov_l_Ri, (R4 d, IMM i, IMM offset))
{
    emit_byte(0xc7);
    emit_byte(0x40 + d);
    emit_byte(offset);
    emit_long(i);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_Ri, (R4 d, IMM i, IMM offset))

LOWFUNC(NONE, WRITE, 3, raw_mov_w_Ri, (R4 d, IMM i, IMM offset))
{
    emit_byte(0x66);
    emit_byte(0xc7);
    emit_byte(0x40 + d);
    emit_byte(offset);
    emit_word(i);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_Ri, (R4 d, IMM i, IMM offset))

LOWFUNC(NONE, WRITE, 3, raw_mov_b_Ri, (R4 d, IMM i, IMM offset))
{
    emit_byte(0xc6);
    emit_byte(0x40 + d);
    emit_byte(offset);
    emit_byte(i);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_Ri, (R4 d, IMM i, IMM offset))

LOWFUNC(NONE, WRITE, 3, raw_mov_l_Rr, (R4 d, R4 s, IMM offset))
{
    emit_byte(0x89);
    emit_byte(0x40 + 8 * s + d);
    emit_byte(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_Rr, (R4 d, R4 s, IMM offset))

LOWFUNC(NONE, WRITE, 3, raw_mov_w_Rr, (R4 d, R2 s, IMM offset))
{
    emit_byte(0x66);
    emit_byte(0x89);
    emit_byte(0x40 + 8 * s + d);
    emit_byte(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_Rr, (R4 d, R2 s, IMM offset))

LOWFUNC(NONE, WRITE, 3, raw_mov_b_Rr, (R4 d, R1 s, IMM offset))
{
    emit_byte(0x88);
    emit_byte(0x40 + 8 * s + d);
    emit_byte(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_Rr, (R4 d, R1 s, IMM offset))

LOWFUNC(NONE, NONE, 3, raw_lea_l_brr, (W4 d, R4 s, IMM offset))
{
    emit_byte(0x8d);
    emit_byte(0x80 + 8 * d + s);
    emit_long(offset);
}
LENDFUNC(NONE, NONE, 3, raw_lea_l_brr, (W4 d, R4 s, IMM offset))

LOWFUNC(NONE, NONE, 5, raw_lea_l_brr_indexed, (W4 d, R4 s, R4 index, IMM factor, IMM offset))
{
    emit_byte(0x8d);
    if (!offset)
    {
        if (s != 5)
        {
            emit_byte(0x04 + 8 * d);
            emit_byte(0x40 * factor + 8 * index + s);
            return;
        }
        emit_byte(0x44 + 8 * d);
        emit_byte(0x40 * factor + 8 * index + s);
        emit_byte(0);
        return;
    }
    emit_byte(0x84 + 8 * d);
    emit_byte(0x40 * factor + 8 * index + s);
    emit_long(offset);
}
LENDFUNC(NONE, NONE, 5, raw_lea_l_brr_indexed, (W4 d, R4 s, R4 index, IMM factor, IMM offset))

LOWFUNC(NONE, NONE, 3, raw_lea_l_rr_indexed, (W4 d, R4 s, R4 index))
{
    emit_byte(0x8d);
    if (s == 5)
    {
        emit_byte(0x44 + 8 * d);
        emit_byte(8 * index + s);
        emit_byte(0);
        return;
    }
    emit_byte(0x04 + 8 * d);
    emit_byte(8 * index + s);
}
LENDFUNC(NONE, NONE, 3, raw_lea_l_rr_indexed, (W4 d, R4 s, R4 index))

LOWFUNC(NONE, WRITE, 3, raw_mov_l_bRr, (R4 d, R4 s, IMM offset))
{
    emit_byte(0x89);
    emit_byte(0x80 + 8 * s + d);
    emit_long(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_l_bRr, (R4 d, R4 s, IMM offset))

LOWFUNC(NONE, WRITE, 3, raw_mov_w_bRr, (R4 d, R2 s, IMM offset))
{
    emit_byte(0x66);
    emit_byte(0x89);
    emit_byte(0x80 + 8 * s + d);
    emit_long(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_w_bRr, (R4 d, R2 s, IMM offset))

LOWFUNC(NONE, WRITE, 3, raw_mov_b_bRr, (R4 d, R1 s, IMM offset))
{
    emit_byte(0x88);
    emit_byte(0x80 + 8 * s + d);
    emit_long(offset);
}
LENDFUNC(NONE, WRITE, 3, raw_mov_b_bRr, (R4 d, R1 s, IMM offset))

LOWFUNC(NONE, NONE, 1, raw_bswap_32, (RW4 r))
{
    emit_byte(0x0f);
    emit_byte(0xc8 + r);
}
LENDFUNC(NONE, NONE, 1, raw_bswap_32, (RW4 r))

LOWFUNC(WRITE, NONE, 1, raw_bswap_16, (RW2 r))
{
    emit_byte(0x66);
    emit_byte(0xc1);
    emit_byte(0xc0 + r);
    emit_byte(0x08);
}
LENDFUNC(WRITE, NONE, 1, raw_bswap_16, (RW2 r))

LOWFUNC(NONE, NONE, 2, raw_mov_l_rr, (W4 d, R4 s))
{
    emit_byte(0x89);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(NONE, NONE, 2, raw_mov_l_rr, (W4 d, R4 s))

LOWFUNC(NONE, WRITE, 2, raw_mov_l_mr, (IMM d, R4 s))
{
    emit_byte(0x89);
    emit_byte(0x05 + 8 * s);
    emit_long(d);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_l_mr, (IMM d, R4 s))

LOWFUNC(NONE, READ, 2, raw_mov_l_rm, (W4 d, MEMR s))
{
    emit_byte(0x8b);
    emit_byte(0x05 + 8 * d);
    emit_long(s);
}
LENDFUNC(NONE, READ, 2, raw_mov_l_rm, (W4 d, MEMR s))

LOWFUNC(NONE, WRITE, 2, raw_mov_w_mr, (IMM d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x89);
    emit_byte(0x05 + 8 * s);
    emit_long(d);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_w_mr, (IMM d, R2 s))

LOWFUNC(NONE, READ, 2, raw_mov_w_rm, (W2 d, IMM s))
{
    emit_byte(0x66);
    emit_byte(0x8b);
    emit_byte(0x05 + 8 * d);
    emit_long(s);
}
LENDFUNC(NONE, READ, 2, raw_mov_w_rm, (W2 d, IMM s))

LOWFUNC(NONE, WRITE, 2, raw_mov_b_mr, (IMM d, R1 s))
{
    emit_byte(0x88);
    emit_byte(0x05 + 8 * s);
    emit_long(d);
}
LENDFUNC(NONE, WRITE, 2, raw_mov_b_mr, (IMM d, R1 s))

LOWFUNC(NONE, READ, 2, raw_mov_b_rm, (W1 d, IMM s))
{
    emit_byte(0x8a);
    emit_byte(0x05 + 8 * d);
    emit_long(s);
}
LENDFUNC(NONE, READ, 2, raw_mov_b_rm, (W1 d, IMM s))

LOWFUNC(NONE, NONE, 2, raw_mov_l_ri, (W4 d, IMM s))
{
    emit_byte(0xb8 + d);
    emit_long(s);
}
LENDFUNC(NONE, NONE, 2, raw_mov_l_ri, (W4 d, IMM s))

LOWFUNC(NONE, NONE, 2, raw_mov_w_ri, (W2 d, IMM s))
{
    emit_byte(0x66);
    emit_byte(0xb8 + d);
    emit_word(s);
}
LENDFUNC(NONE, NONE, 2, raw_mov_w_ri, (W2 d, IMM s))

LOWFUNC(NONE, NONE, 2, raw_mov_b_ri, (W1 d, IMM s))
{
    emit_byte(0xb0 + d);
    emit_byte(s);
}
LENDFUNC(NONE, NONE, 2, raw_mov_b_ri, (W1 d, IMM s))

LOWFUNC(RMW, RMW, 2, raw_adc_l_mi, (MEMRW d, IMM s))
{
    emit_byte(0x81);
    emit_byte(0x15);
    emit_long(d);
    emit_long(s);
}
LENDFUNC(RMW, RMW, 2, raw_adc_l_mi, (MEMRW d, IMM s))

LOWFUNC(WRITE, RMW, 2, raw_add_l_mi, (IMM d, IMM s))
{
    emit_byte(0x81);
    emit_byte(0x05);
    emit_long(d);
    emit_long(s);
}
LENDFUNC(WRITE, RMW, 2, raw_add_l_mi, (IMM d, IMM s))

LOWFUNC(WRITE, RMW, 2, raw_add_w_mi, (IMM d, IMM s))
{
    emit_byte(0x66);
    emit_byte(0x81);
    emit_byte(0x05);
    emit_long(d);
    emit_word(s);
}
LENDFUNC(WRITE, RMW, 2, raw_add_w_mi, (IMM d, IMM s))

LOWFUNC(WRITE, RMW, 2, raw_add_b_mi, (IMM d, IMM s))
{
    emit_byte(0x80);
    emit_byte(0x05);
    emit_long(d);
    emit_byte(s);
}
LENDFUNC(WRITE, RMW, 2, raw_add_b_mi, (IMM d, IMM s))

LOWFUNC(WRITE, NONE, 2, raw_test_l_ri, (R4 d, IMM i))
{
    emit_byte(0xf7);
    emit_byte(0xc0 + d);
    emit_long(i);
}
LENDFUNC(WRITE, NONE, 2, raw_test_l_ri, (R4 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_test_l_rr, (R4 d, R4 s))
{
    emit_byte(0x85);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_test_l_rr, (R4 d, R4 s))

LOWFUNC(WRITE, NONE, 2, raw_test_w_rr, (R2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x85);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_test_w_rr, (R2 d, R2 s))

LOWFUNC(WRITE, NONE, 2, raw_test_b_rr, (R1 d, R1 s))
{
    emit_byte(0x84);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_test_b_rr, (R1 d, R1 s))

LOWFUNC(WRITE, NONE, 2, raw_and_l_ri, (RW4 d, IMM i))
{
    emit_byte(0x81);
    emit_byte(0xe0 + d);
    emit_long(i);
}
LENDFUNC(WRITE, NONE, 2, raw_and_l_ri, (RW4 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_and_w_ri, (RW2 d, IMM i))
{
    emit_byte(0x66);
    emit_byte(0x81);
    emit_byte(0xe0 + d);
    emit_word(i);
}
LENDFUNC(WRITE, NONE, 2, raw_and_w_ri, (RW2 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_and_l, (RW4 d, R4 s))
{
    emit_byte(0x21);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_and_l, (RW4 d, R4 s))

LOWFUNC(WRITE, NONE, 2, raw_and_w, (RW2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x21);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_and_w, (RW2 d, R2 s))

LOWFUNC(WRITE, NONE, 2, raw_and_b, (RW1 d, R1 s))
{
    emit_byte(0x20);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_and_b, (RW1 d, R1 s))

LOWFUNC(WRITE, NONE, 2, raw_or_l_ri, (RW4 d, IMM i))
{
    emit_byte(0x81);
    emit_byte(0xc8 + d);
    emit_long(i);
}
LENDFUNC(WRITE, NONE, 2, raw_or_l_ri, (RW4 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_or_l, (RW4 d, R4 s))
{
    emit_byte(0x09);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_or_l, (RW4 d, R4 s))

LOWFUNC(WRITE, NONE, 2, raw_or_w, (RW2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x09);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_or_w, (RW2 d, R2 s))

LOWFUNC(WRITE, NONE, 2, raw_or_b, (RW1 d, R1 s))
{
    emit_byte(0x08);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_or_b, (RW1 d, R1 s))

LOWFUNC(RMW, NONE, 2, raw_adc_l, (RW4 d, R4 s))
{
    emit_byte(0x11);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(RMW, NONE, 2, raw_adc_l, (RW4 d, R4 s))

LOWFUNC(RMW, NONE, 2, raw_adc_w, (RW2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x11);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(RMW, NONE, 2, raw_adc_w, (RW2 d, R2 s))

LOWFUNC(RMW, NONE, 2, raw_adc_b, (RW1 d, R1 s))
{
    emit_byte(0x10);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(RMW, NONE, 2, raw_adc_b, (RW1 d, R1 s))

LOWFUNC(WRITE, NONE, 2, raw_add_l, (RW4 d, R4 s))
{
    emit_byte(0x01);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_l, (RW4 d, R4 s))

LOWFUNC(WRITE, NONE, 2, raw_add_w, (RW2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x01);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_w, (RW2 d, R2 s))

LOWFUNC(WRITE, NONE, 2, raw_add_b, (RW1 d, R1 s))
{
    emit_byte(0x00);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_add_b, (RW1 d, R1 s))

LOWFUNC(WRITE, NONE, 2, raw_sub_l_ri, (RW4 d, IMM i))
{
    if (isbyte(i))
    {
        emit_byte(0x83);
        emit_byte(0xe8 + d);
        emit_byte(i);
    }
    else
    {
        emit_byte(0x81);
        emit_byte(0xe8 + d);
        emit_long(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_sub_l_ri, (RW4 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_sub_b_ri, (RW1 d, IMM i))
{
    emit_byte(0x80);
    emit_byte(0xe8 + d);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_b_ri, (RW1 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_add_l_ri, (RW4 d, IMM i))
{
    if (isbyte(i))
    {
        emit_byte(0x83);
        emit_byte(0xc0 + d);
        emit_byte(i);
    }
    else
    {
        emit_byte(0x81);
        emit_byte(0xc0 + d);
        emit_long(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_add_l_ri, (RW4 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_add_w_ri, (RW2 d, IMM i))
{
    if (isbyte(i))
    {
        emit_byte(0x66);
        emit_byte(0x83);
        emit_byte(0xc0 + d);
        emit_byte(i);
    }
    else
    {
        emit_byte(0x66);
        emit_byte(0x81);
        emit_byte(0xc0 + d);
        emit_word(i);
    }
}
LENDFUNC(WRITE, NONE, 2, raw_add_w_ri, (RW2 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_add_b_ri, (RW1 d, IMM i))
{
    emit_byte(0x80);
    emit_byte(0xc0 + d);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_add_b_ri, (RW1 d, IMM i))

LOWFUNC(RMW, NONE, 2, raw_sbb_l, (RW4 d, R4 s))
{
    emit_byte(0x19);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(RMW, NONE, 2, raw_sbb_l, (RW4 d, R4 s))

LOWFUNC(RMW, NONE, 2, raw_sbb_w, (RW2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x19);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(RMW, NONE, 2, raw_sbb_w, (RW2 d, R2 s))

LOWFUNC(RMW, NONE, 2, raw_sbb_b, (RW1 d, R1 s))
{
    emit_byte(0x18);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(RMW, NONE, 2, raw_sbb_b, (RW1 d, R1 s))

LOWFUNC(WRITE, NONE, 2, raw_sub_l, (RW4 d, R4 s))
{
    emit_byte(0x29);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_l, (RW4 d, R4 s))

LOWFUNC(WRITE, NONE, 2, raw_sub_w, (RW2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x29);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_w, (RW2 d, R2 s))

LOWFUNC(WRITE, NONE, 2, raw_sub_b, (RW1 d, R1 s))
{
    emit_byte(0x28);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_sub_b, (RW1 d, R1 s))

LOWFUNC(WRITE, NONE, 2, raw_cmp_l, (R4 d, R4 s))
{
    emit_byte(0x39);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_l, (R4 d, R4 s))

LOWFUNC(WRITE, NONE, 2, raw_cmp_l_ri, (R4 r, IMM i))
{
    emit_byte(0x81);
    emit_byte(0xf8 + r);
    emit_long(i);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_l_ri, (R4 r, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_cmp_w, (R2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x39);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_w, (R2 d, R2 s))

LOWFUNC(WRITE, NONE, 2, raw_cmp_b_ri, (R1 d, IMM i))
{
    emit_byte(0x80);
    emit_byte(0xf8 + d);
    emit_byte(i);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_b_ri, (R1 d, IMM i))

LOWFUNC(WRITE, NONE, 2, raw_cmp_b, (R1 d, R1 s))
{
    emit_byte(0x38);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_cmp_b, (R1 d, R1 s))

LOWFUNC(WRITE, NONE, 2, raw_xor_l, (RW4 d, R4 s))
{
    emit_byte(0x31);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_xor_l, (RW4 d, R4 s))

LOWFUNC(WRITE, NONE, 2, raw_xor_w, (RW2 d, R2 s))
{
    emit_byte(0x66);
    emit_byte(0x31);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_xor_w, (RW2 d, R2 s))

LOWFUNC(WRITE, NONE, 2, raw_xor_b, (RW1 d, R1 s))
{
    emit_byte(0x30);
    emit_byte(0xc0 + 8 * s + d);
}
LENDFUNC(WRITE, NONE, 2, raw_xor_b, (RW1 d, R1 s))

LOWFUNC(WRITE, RMW, 2, raw_sub_l_mi, (MEMRW d, IMM s))
{
    emit_byte(0x81);
    emit_byte(0x2d);
    emit_long(d);
    emit_long(s);
}
LENDFUNC(WRITE, RMW, 2, raw_sub_l_mi, (MEMRW d, IMM s))

LOWFUNC(WRITE, READ, 2, raw_cmp_l_mi, (MEMR d, IMM s))
{
    emit_byte(0x81);
    emit_byte(0x3d);
    emit_long(d);
    emit_long(s);
}
LENDFUNC(WRITE, READ, 2, raw_cmp_l_mi, (MEMR d, IMM s))

LOWFUNC(NONE, NONE, 2, raw_xchg_l_rr, (RW4 r1, RW4 r2))
{
    emit_byte(0x87);
    emit_byte(0xc0 + 8 * r1 + r2);
}
LENDFUNC(NONE, NONE, 2, raw_xchg_l_rr, (RW4 r1, RW4 r2))

LOWFUNC(READ, WRITE, 0, raw_pushfl, (void))
{
    emit_byte(0x9c);
}
LENDFUNC(READ, WRITE, 0, raw_pushfl, (void))

LOWFUNC(WRITE, READ, 0, raw_popfl, (void))
{
    emit_byte(0x9d);
}
LENDFUNC(WRITE, READ, 0, raw_popfl, (void))

/*************************************************************************
* Unoptimizable stuff --- jump                                          *
*************************************************************************/

static __forceinline void raw_call_r(R4 r)
{
    lopt_emit_all();
    emit_byte(0xff);
    emit_byte(0xd0 + r);
}

static __forceinline void raw_jmp_r(R4 r)
{
    lopt_emit_all();
    emit_byte(0xff);
    emit_byte(0xe0 + r);
}

static __forceinline void raw_jmp_m_indexed(uint base, uint r, uint m)
{
    int sib = 0;

    switch (m)
    {
        case 1: sib = 0x05;
            break;
        case 2: sib = 0x45;
            break;
        case 4: sib = 0x85;
            break;
        case 8: sib = 0xC5;
            break;
        default: abort();
    }
    lopt_emit_all();
    emit_byte(0xff);
    emit_byte(0x24);
    emit_byte(8 * r + sib);
    emit_long(base);
}

static __forceinline void raw_jmp_m(uint base)
{
    lopt_emit_all();
    emit_byte(0xff);
    emit_byte(0x25);
    emit_long(base);
}

static __forceinline void raw_call(uint t)
{
    lopt_emit_all();
    emit_byte(0xe8);
    emit_long(t - (uint)target - 4);
}

static __forceinline void raw_jmp(uint t)
{
    lopt_emit_all();
    emit_byte(0xe9);
    emit_long(t - (uint)target - 4);
}

static __forceinline void raw_jl(uint t)
{
    lopt_emit_all();
    emit_byte(0x0f);
    emit_byte(0x8c);
    emit_long(t - (uint)target - 4);
}

static __forceinline void raw_jz(uint t)
{
    lopt_emit_all();
    emit_byte(0x0f);
    emit_byte(0x84);
    emit_long(t - (uint)target - 4);
}

static __forceinline void raw_jnz(uint t)
{
    lopt_emit_all();
    emit_byte(0x0f);
    emit_byte(0x85);
    emit_long(t - (uint)target - 4);
}

static __forceinline void raw_jnz_l_oponly(void)
{
    lopt_emit_all();
    emit_byte(0x0f);
    emit_byte(0x85);
}

static __forceinline void raw_jcc_l_oponly(int cc)
{
    lopt_emit_all();
    emit_byte(0x0f);
    emit_byte(0x80 + cc);
}

static __forceinline void raw_jnz_b_oponly(void)
{
    lopt_emit_all();
    emit_byte(0x75);
}

static __forceinline void raw_jz_b_oponly(void)
{
    lopt_emit_all();
    emit_byte(0x74);
}

static __forceinline void raw_jmp_l_oponly(void)
{
    lopt_emit_all();
    emit_byte(0xe9);
}

static __forceinline void raw_jmp_b_oponly(void)
{
    lopt_emit_all();
    emit_byte(0xeb);
}

static __forceinline void raw_ret(void)
{
    lopt_emit_all();
    emit_byte(0xc3);
}

static __forceinline void raw_nop(void)
{
    lopt_emit_all();
    emit_byte(0x90);
}

/*************************************************************************
* Flag handling, to and fro UAE flag register                           *
*************************************************************************/

#define FLAG_NREG1 0  /* Set to -1 if any register will do */

static __forceinline void raw_flags_to_reg(int r)
{
    raw_lahf(0); /* Most flags in AH */
    // raw_setcc(r,0); /* V flag in AL */
    raw_setcc_m((uint)live.state[FLAGTMP].mem, 0);

    #if 1 /* Let's avoid those nasty partial register stalls */
    // raw_mov_b_mr((uint)live.state[FLAGTMP].mem,r);
    raw_mov_b_mr(((uint)live.state[FLAGTMP].mem) + 1, r + 4);
    // live.state[FLAGTMP].status=CLEAN;
    live.state[FLAGTMP].status = INMEM;
    live.state[FLAGTMP].realreg = -1;
    /* We just "evicted" FLAGTMP. */
    if (live.nat[r].nholds != 1)
    {
        /* Huh? */
        abort();
    }
    live.nat[r].nholds = 0;
    #endif
}

#define FLAG_NREG2 0  /* Set to -1 if any register will do */
static __forceinline void raw_reg_to_flags(int r)
{
    raw_cmp_b_ri(r, -127); /* set V */
    raw_sahf(0);
}

/* Apparently, there are enough instructions between flag store and
   flag reload to avoid the partial memory stall */
static __forceinline void raw_load_flagreg(uint target, uint r)
{
    #if 1
    raw_mov_l_rm(target, (uint)live.state[r].mem);
    #else
    raw_mov_b_rm(target, (uint)live.state[r].mem);
    raw_mov_b_rm(target + 4, ((uint)live.state[r].mem) + 1);
    #endif
}

/* FLAGX is word-sized */
static __forceinline void raw_load_flagx(uint target, uint r)
{
    if (live.nat[target].canword)
        raw_mov_w_rm(target, (uint)live.state[r].mem);
    else
        raw_mov_l_rm(target, (uint)live.state[r].mem);
}

#define NATIVE_FLAG_Z 0x40
#define NATIVE_CC_EQ 4
static __forceinline void raw_flags_set_zero(int f, int r, int t)
{
    //  FIXME: this is really suboptimal
    raw_pushfl();
    raw_pop_l_r(f);
    raw_and_l_ri(f, ~NATIVE_FLAG_Z);
    raw_test_l_rr(r, r);
    raw_mov_l_ri(r, 0);
    raw_mov_l_ri(t, NATIVE_FLAG_Z);
    raw_cmov_l_rr(r, t, NATIVE_CC_EQ);
    raw_or_l(f, r);
    raw_push_l_r(f);
    raw_popfl();
}

static __forceinline void raw_inc_sp(int off)
{
    raw_add_l_ri(4, off);
}

/*************************************************************************
* Checking for CPU features                                             *
*************************************************************************/

struct cpuinfo_x86
{
    byte x86;             // CPU family
    byte x86_vendor;      // CPU vendor
    byte x86_processor;   // CPU canonical processor type
    byte x86_brand_id;    // CPU BrandID if supported, yield 0 otherwise
    uint x86_hwcap;
    byte x86_model;
    byte x86_mask;
    int cpuid_level;        // Maximum supported CPUID level, -1=no CPUID
    char x86_vendor_id[16];
};
struct cpuinfo_x86 cpuinfo;

enum
{
    X86_VENDOR_INTEL= 0,
    X86_VENDOR_CYRIX= 1,
    X86_VENDOR_AMD= 2,
    X86_VENDOR_UMC= 3,
    X86_VENDOR_NEXGEN= 4,
    X86_VENDOR_CENTAUR= 5,
    X86_VENDOR_RISE= 6,
    X86_VENDOR_TRANSMETA= 7,
    X86_VENDOR_NSC= 8,
    X86_VENDOR_UNKNOWN= 0xff
};

enum
{
    X86_PROCESSOR_I386,                     /* 80386 */
    X86_PROCESSOR_I486,                     /* 80486DX, 80486SX, 80486DX[24] */
    X86_PROCESSOR_PENTIUM,
    X86_PROCESSOR_PENTIUMPRO,
    X86_PROCESSOR_K6,
    X86_PROCESSOR_ATHLON,
    X86_PROCESSOR_PENTIUM4,
    X86_PROCESSOR_K8,
    X86_PROCESSOR_max
};

struct x86_alignments_struct
{
    int align_loop;
    int align_loop_max_skip;
    int align_jump;
    int align_jump_max_skip;
    int align_func;
};

static x86_alignments_struct x86_alignments[X86_PROCESSOR_max + 1] = {
    { 4, 3, 4, 3, 4 },
    { 16, 15, 16, 15, 16 },
    { 16, 7, 16, 7, 16 },
    { 16, 15, 16, 7, 16 },
    { 32, 7, 32, 7, 32 },
    { 16, 7, 16, 7, 16 },
    { 0, 0, 0, 0, 0 },
    { 16, 7, 16, 7, 16 },
    { 0, 0, 0, 0, 0 }
};

static void
x86_get_cpu_vendor(struct cpuinfo_x86* c)
{
    char* v = c->x86_vendor_id;

    if (!strcmp(v, "GenuineIntel"))
        c->x86_vendor = X86_VENDOR_INTEL;
    else if (!strcmp(v, "AuthenticAMD"))
        c->x86_vendor = X86_VENDOR_AMD;
    else if (!strcmp(v, "CyrixInstead"))
        c->x86_vendor = X86_VENDOR_CYRIX;
    else if (!strcmp(v, "Geode by NSC"))
        c->x86_vendor = X86_VENDOR_NSC;
    else if (!strcmp(v, "UMC UMC UMC "))
        c->x86_vendor = X86_VENDOR_UMC;
    else if (!strcmp(v, "CentaurHauls"))
        c->x86_vendor = X86_VENDOR_CENTAUR;
    else if (!strcmp(v, "NexGenDriven"))
        c->x86_vendor = X86_VENDOR_NEXGEN;
    else if (!strcmp(v, "RiseRiseRise"))
        c->x86_vendor = X86_VENDOR_RISE;
    else if (!strcmp(v, "GenuineTMx86") ||
        !strcmp(v, "TransmetaCPU"))
        c->x86_vendor = X86_VENDOR_TRANSMETA;
    else
        c->x86_vendor = X86_VENDOR_UNKNOWN;
}

static void cpuid(uint op, uint* eax, uint* ebx, uint* ecx, uint* edx)
{
    const int CPUID_SPACE = 4096;
    byte* cpuid_space = (byte*)cache_alloc(CPUID_SPACE);
    static uint s_op, s_eax, s_ebx, s_ecx, s_edx;
    byte* tmp = get_target();

    s_op = op;
    set_target(cpuid_space);
    raw_push_l_r(0); /* eax */
    raw_push_l_r(1); /* ecx */
    raw_push_l_r(2); /* edx */
    raw_push_l_r(3); /* ebx */
    raw_mov_l_rm(0, (uintptr) & s_op);
    raw_cpuid(0);
    raw_mov_l_mr((uintptr) & s_eax, 0);
    raw_mov_l_mr((uintptr) & s_ebx, 3);
    raw_mov_l_mr((uintptr) & s_ecx, 1);
    raw_mov_l_mr((uintptr) & s_edx, 2);
    raw_pop_l_r(3);
    raw_pop_l_r(2);
    raw_pop_l_r(1);
    raw_pop_l_r(0);
    raw_ret();
    set_target(tmp);

    ((compop_func*)cpuid_space)(0);
    if (eax != NULL)
        *eax = s_eax;
    if (ebx != NULL)
        *ebx = s_ebx;
    if (ecx != NULL)
        *ecx = s_ecx;
    if (edx != NULL)
        *edx = s_edx;

    cache_free(cpuid_space);
}

static void raw_init_cpu(void)
{
    struct cpuinfo_x86* c = &cpuinfo;
    uint xlvl;

    /* Defaults */
    c->x86_processor = X86_PROCESSOR_max;
    c->x86_vendor = X86_VENDOR_UNKNOWN;
    c->cpuid_level = -1;            /* CPUID not detected */
    c->x86_model = c->x86_mask = 0; /* So far unknown... */
    c->x86_vendor_id[0] = '\0';     /* Unset */
    c->x86_hwcap = 0;

    /* Get vendor name */
    c->x86_vendor_id[12] = '\0';
    cpuid(0x00000000,
        (uint*)&c->cpuid_level,
        (uint*)&c->x86_vendor_id[0],
        (uint*)&c->x86_vendor_id[8],
        (uint*)&c->x86_vendor_id[4]);
    x86_get_cpu_vendor(c);

    /* Intel-defined flags: level 0x00000001 */
    c->x86_brand_id = 0;
    if (c->cpuid_level >= 0x00000001)
    {
        uint tfms, brand_id;
        cpuid(0x00000001, &tfms, &brand_id, NULL, &c->x86_hwcap);
        c->x86 = (tfms >> 8) & 15;
        c->x86_model = (tfms >> 4) & 15;
        c->x86_brand_id = brand_id & 0xff;
        if ((c->x86_vendor == X86_VENDOR_AMD) &&
            (c->x86 == 0xf))
        {
            /* AMD Extended Family and Model Values */
            c->x86 += (tfms >> 20) & 0xff;
            c->x86_model += (tfms >> 12) & 0xf0;
        }
        c->x86_mask = tfms & 15;
    }
    else
    {
        /* Have CPUID level 0 only - unheard of */
        c->x86 = 4;
    }

    /* AMD-defined flags: level 0x80000001 */
    cpuid(0x80000000, &xlvl, NULL, NULL, NULL);
    if ((xlvl & 0xffff0000) == 0x80000000)
    {
        if (xlvl >= 0x80000001)
        {
            uint features;
            cpuid(0x80000001, NULL, NULL, NULL, &features);
            if (features & (1 << 29))
            {
                /* Assume x86-64 if long mode is supported */
                c->x86_processor = X86_PROCESSOR_K8;
            }
        }
    }

    /* Canonicalize processor ID */
    switch (c->x86)
    {
        case 3:
            c->x86_processor = X86_PROCESSOR_I386;
            break;
        case 4:
            c->x86_processor = X86_PROCESSOR_I486;
            break;
        case 5:
            if (c->x86_vendor == X86_VENDOR_AMD)
                c->x86_processor = X86_PROCESSOR_K6;
            else
                c->x86_processor = X86_PROCESSOR_PENTIUM;
            break;
        case 6:
            if (c->x86_vendor == X86_VENDOR_AMD)
                c->x86_processor = X86_PROCESSOR_ATHLON;
            else
                c->x86_processor = X86_PROCESSOR_PENTIUMPRO;
            break;
        case 15:
            if (c->x86_vendor == X86_VENDOR_INTEL)
            {
                /* Assume any BrandID >= 8 and family == 15 yields a Pentium 4 */
                if (c->x86_brand_id >= 8)
                    c->x86_processor = X86_PROCESSOR_PENTIUM4;
            }
            if (c->x86_vendor == X86_VENDOR_AMD)
            {
                /* Assume an Athlon processor if family == 15 and it was not
                   detected as an x86-64 so far */
                if (c->x86_processor == X86_PROCESSOR_max)
                    c->x86_processor = X86_PROCESSOR_ATHLON;
            }
            break;
    }

    /* Have CMOV support? */
    have_cmov = c->x86_hwcap & (1 << 15);

    #if 0
    /* Can the host CPU suffer from partial register stalls? */
    have_rat_stall = (c->x86_vendor == X86_VENDOR_INTEL);
    /* It appears that partial register writes are a bad idea even on
       AMD K7 cores, even though they are not supposed to have the
       dreaded rat stall. Why? Anyway, that's why we lie about it ;-) */
    if (c->x86_processor == X86_PROCESSOR_ATHLON)
        have_rat_stall = 1;
    #endif
    have_rat_stall = 1;

    /* Alignments */
    if (tune_alignment)
    {
        align_loops = x86_alignments[c->x86_processor].align_loop;
        align_jumps = x86_alignments[c->x86_processor].align_jump;
    }
    {
        TCHAR* s = Unicode::au(c->x86_vendor_id);
        Logger::Write(L"CPUID level=%d, Family=%d, Model=%d, Mask=%d, Vendor=%s [%d]\n",
            c->cpuid_level, c->x86, c->x86_model, c->x86_mask, s, c->x86_vendor);
        free(s);
    }
}

#if 0
static int target_check_bsf(void)
{
    int mismatch = 0;
    for (int g_ZF = 0; g_ZF <= 1; g_ZF++)
    {
        for (int g_CF = 0; g_CF <= 1; g_CF++)
        {
            for (int g_OF = 0; g_OF <= 1; g_OF++)
            {
                for (int g_SF = 0; g_SF <= 1; g_SF++)
                {
                    for (int value = -1; value <= 1; value++)
                    {
                        uint flags = (g_SF << 7) | (g_OF << 11) | (g_ZF << 6) | g_CF;
                        uint tmp = value;
                        __asm__ volatile ("push %0; popf; bsf %1,%1; pushf; pop %0"
                            : "+r" (flags), "+r" (tmp) : : "cc");
                        int OF = (flags >> 11) & 1;
                        int SF = (flags >> 7) & 1;
                        int ZF = (flags >> 6) & 1;
                        int CF = flags & 1;
                        tmp = (value == 0);
                        if (ZF != tmp || SF != g_SF || OF != g_OF || CF != g_CF)
                            mismatch = true;
                    }
                }
            }
        }
    }
    if (mismatch)
        Logger::Write(L"Target CPU defines all flags on BSF instruction\n");
    return !mismatch;
}
#endif

#if 0

/*************************************************************************
* Checking for CPU features                                             *
*************************************************************************/

typedef struct
{
    uint eax;
    uint ecx;
    uint edx;
    uint ebx;
} x86_regs;

/* This could be so much easier if it could make assumptions about the
   compiler... */

static uint cpuid_ptr;
static uint cpuid_level;

static x86_regs cpuid(uint level)
{
    x86_regs answer;
    byte* cpuid_space;
    void* tmp = get_target();

    cpuid_ptr = (uint) & answer;
    cpuid_level = level;

    cpuid_space = cache_alloc(256);
    set_target(cpuid_space);
    raw_push_l_r(0); /* eax */
    raw_push_l_r(1); /* ecx */
    raw_push_l_r(2); /* edx */
    raw_push_l_r(3); /* ebx */
    raw_push_l_r(7); /* edi */
    raw_mov_l_rm(0, (uint) & cpuid_level);
    raw_cpuid(0);
    raw_mov_l_rm(7, (uint) & cpuid_ptr);
    raw_mov_l_Rr(7, 0, 0);
    raw_mov_l_Rr(7, 1, 4);
    raw_mov_l_Rr(7, 2, 8);
    raw_mov_l_Rr(7, 3, 12);
    raw_pop_l_r(7);
    raw_pop_l_r(3);
    raw_pop_l_r(2);
    raw_pop_l_r(1);
    raw_pop_l_r(0);
    raw_ret();
    set_target(tmp);

    ((cpuop_func*)cpuid_space)(0);
    cache_free(cpuid_space);
    return answer;
}

static void raw_init_cpu(void)
{
    x86_regs x;
    uint maxlev;

    x = cpuid(0);
    maxlev = x.eax;
    Logger::Write(L"Max CPUID level=%d Processor is %c%c%c%c%c%c%c%c%c%c%c%c\n",
        maxlev,
        x.ebx,
        x.ebx >> 8,
        x.ebx >> 16,
        x.ebx >> 24,
        x.edx,
        x.edx >> 8,
        x.edx >> 16,
        x.edx >> 24,
        x.ecx,
        x.ecx >> 8,
        x.ecx >> 16,
        x.ecx >> 24
        );
    have_rat_stall = (x.ecx == 0x6c65746e);

    if (maxlev >= 1)
    {
        x = cpuid(1);
        if (x.edx & (1 << 15))
            have_cmov = 1;
    }
    have_rat_stall = 1;
    #if 0
    if (!have_cmov)
        have_rat_stall = 0;
    #endif
    #if 0
    Logger::Write(L"have_cmov=%d, avoid_cmov=%d, have_rat_stall=%d\n",
        have_cmov, currprefs.avoid_cmov, have_rat_stall);
    if (currprefs.avoid_cmov)
    {
        Logger::Write(L"Disabling cmov use despite processor claiming to support it!\n");
        have_cmov = 0;
    }
    #else
    /* Dear Bernie, I don't want to keep around options which are useless, and not
       represented in the GUI anymore... Is this okay? */
    Logger::Write(L"have_cmov=%d, have_rat_stall=%d\n", have_cmov, have_rat_stall);
    #endif
    #if 0 /* For testing of non-cmov code! */
    have_cmov = 0;
    #endif
    #if 0 /* It appears that partial register writes are a bad idea even on
             AMD K7 cores, even though they are not supposed to have the
             dreaded rat stall. Why? Anyway, that's why we lie about it ;-) */
    if (have_cmov)
        have_rat_stall = 1;
    #endif
}
#endif

/*************************************************************************
* FPU stuff                                                             *
*************************************************************************/

static __forceinline void raw_fp_init(void)
{
    int i;

    for (i = 0; i < N_FREGS; i++)
        live.spos[i] = -2;
    live.tos = -1;  /* Stack is empty */
}

static __forceinline void raw_fp_cleanup_drop(void)
{
    #if 0
    /* using FINIT instead of popping all the entries.
       Seems to have side effects --- there is display corruption in
       Quake when this is used */
    if (live.tos > 1)
    {
        emit_byte(0x9b);
        emit_byte(0xdb);
        emit_byte(0xe3);
        live.tos = -1;
    }
    #endif
    while (live.tos >= 1)
    {
        emit_byte(0xde);
        emit_byte(0xd9);
        live.tos -= 2;
    }
    while (live.tos >= 0)
    {
        emit_byte(0xdd);
        emit_byte(0xd8);
        live.tos--;
    }
    raw_fp_init();
}

static __forceinline void make_tos(int r)
{
    int p, q;

    if (live.spos[r] < 0)   /* Register not yet on stack */
    {
        emit_byte(0xd9);
        emit_byte(0xe8); /* Push '1' on the stack, just to grow it */
        live.tos++;
        live.spos[r] = live.tos;
        live.onstack[live.tos] = r;
        return;
    }
    /* Register is on stack */
    if (live.tos == live.spos[r])
        return;
    p = live.spos[r];
    q = live.onstack[live.tos];

    emit_byte(0xd9);
    emit_byte(0xc8 + live.tos - live.spos[r]);  /* exchange it with top of stack */
    live.onstack[live.tos] = r;
    live.spos[r] = live.tos;
    live.onstack[p] = q;
    live.spos[q] = p;
}

static __forceinline int stackpos(int r)
{
    if (live.spos[r] < 0)
        abort();
    if (live.tos < live.spos[r])
    {
        Logger::Write(L"JIT: Looking for spos for fnreg %d\n", r);
        abort();
    }
    return live.tos - live.spos[r];
}

/* IMO, calling usereg(r) makes no sense, if the register r should supply our function with
   an argument, because I would expect all arguments to be on the stack already, won't they?
   Thus, usereg(s) is always useless and also for every FRW d it's too late here now. PeterK
 */
static __forceinline void usereg(int r)
{
    if (live.spos[r] < 0)
    {
        //  Logger::Write (L"usereg wants to push reg %d onto the x87 stack calling make_tos\n", r);
        make_tos(r);
    }
}

/* This is called with one FP value in a reg *above* tos,
   which it will pop off the stack if necessary */
static __forceinline void tos_make(int r)
{
    if (live.spos[r] < 0)
    {
        live.tos++;
        live.spos[r] = live.tos;
        live.onstack[live.tos] = r;
        return;
    }
    emit_byte(0xdd);
    emit_byte(0xd8 + (live.tos + 1) - live.spos[r]);
    /* store top of stack in reg and pop it*/
}

LOWFUNC(NONE, WRITE, 2, raw_fmov_mr, (MEMW m, FR r))
{
    make_tos(r);
    emit_byte(0xdd);
    emit_byte(0x15);
    emit_long(m);
}
LENDFUNC(NONE, WRITE, 2, raw_fmov_mr, (MEMW m, FR r))

LOWFUNC(NONE, WRITE, 2, raw_fmov_mr_drop, (MEMW m, FR r))
{
    make_tos(r);
    emit_byte(0xdd);
    emit_byte(0x1d);
    emit_long(m);
    live.onstack[live.tos] = -1;
    live.tos--;
    live.spos[r] = -2;
}
LENDFUNC(NONE, WRITE, 2, raw_fmov_mr, (MEMW m, FR r))

LOWFUNC(NONE, READ, 2, raw_fmov_rm, (FW r, MEMR m))
{
    emit_byte(0xdd);
    emit_byte(0x05);
    emit_long(m);
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmov_rm, (FW r, MEMR m))

LOWFUNC(NONE, READ, 2, raw_fmovi_rm, (FW r, MEMR m))
{
    emit_byte(0xdb);
    emit_byte(0x05);
    emit_long(m);
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmovi_rm, (FW r, MEMR m))

LOWFUNC(NONE, WRITE, 3, raw_fmovi_mrb, (MEMW m, FR r, double* bounds))
{
    /* Clamp value to the given range and convert to integer.
       ideally, the clamping should be done using two FCMOVs, but
       this requires two free fp registers, and we can only be sure
       of having one. Using a jump for the lower bound and an FCMOV
       for the upper bound, we can do it with one scratch register.
     */

    int rs;
    usereg(r);
    rs = stackpos(r) + 1;

    /* Lower bound onto stack */
    emit_byte(0xdd);
    emit_byte(0x05);
    emit_long((uint) & bounds[0]); /* fld double from lower */

    /* Clamp to lower */
    emit_byte(0xdb);
    emit_byte(0xf0 + rs); /* fcomi lower,r */
    emit_byte(0x73);
    emit_byte(12);      /* jae to writeback */

    /* Upper bound onto stack */
    emit_byte(0xdd);
    emit_byte(0xd8);    /* fstp st(0) */
    emit_byte(0xdd);
    emit_byte(0x05);
    emit_long((uint) & bounds[1]); /* fld double from upper */

    /* Clamp to upper */
    emit_byte(0xdb);
    emit_byte(0xf0 + rs); /* fcomi upper,r */
    emit_byte(0xdb);
    emit_byte(0xd0 + rs); /* fcmovnbe upper,r */

    /* Store to destination */
    emit_byte(0xdb);
    emit_byte(0x1d);
    emit_long(m);
}
LENDFUNC(NONE, WRITE, 3, raw_fmovi_mrb, (MEMW m, FR r, double* bounds))

LOWFUNC(NONE, READ, 2, raw_fmovs_rm, (FW r, MEMR m))
{
    emit_byte(0xd9);
    emit_byte(0x05);
    emit_long(m);
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmovs_rm, (FW r, MEMR m))

LOWFUNC(NONE, WRITE, 2, raw_fmovs_mr, (MEMW m, FR r))
{
    make_tos(r);
    emit_byte(0xd9);
    emit_byte(0x15);
    emit_long(m);
}
LENDFUNC(NONE, WRITE, 2, raw_fmovs_mr, (MEMW m, FR r))

LOWFUNC(NONE, NONE, 1, raw_fcuts_r, (FRW r))
{
    make_tos(r);     /* TOS = r */
    emit_byte(0x83);
    emit_byte(0xc4);
    emit_byte(0xfc); /* add -4 to esp */
    emit_byte(0xd9);
    emit_byte(0x1c);
    emit_byte(0x24); /* fstp store r as SINGLE to [esp] and pop */
    emit_byte(0xd9);
    emit_byte(0x04);
    emit_byte(0x24); /* fld load r as SINGLE from [esp] */
    emit_byte(0x83);
    emit_byte(0xc4);
    emit_byte(0x04); /* add +4 to esp */
}
LENDFUNC(NONE, NONE, 1, raw_fcuts_r, (FRW r))

LOWFUNC(NONE, NONE, 1, raw_fcut_r, (FRW r))
{
    make_tos(r);     /* TOS = r */
    emit_byte(0x83);
    emit_byte(0xc4);
    emit_byte(0xf8); /* add -8 to esp */
    emit_byte(0xdd);
    emit_byte(0x1c);
    emit_byte(0x24); /* fstp store r as DOUBLE to [esp] and pop */
    emit_byte(0xdd);
    emit_byte(0x04);
    emit_byte(0x24); /* fld load r as DOUBLE from [esp] */
    emit_byte(0x83);
    emit_byte(0xc4);
    emit_byte(0x08); /* add +8 to esp */
}
LENDFUNC(NONE, NONE, 1, raw_fcut_r, (FRW r))

LOWFUNC(NONE, READ, 2, raw_fmovl_ri, (FW r, IMMS i))
{
    emit_byte(0x68);
    emit_long(i);    /* push immediate32 onto [esp] */
    emit_byte(0xdb);
    emit_byte(0x04);
    emit_byte(0x24); /* fild load m32int from [esp] */
    emit_byte(0x83);
    emit_byte(0xc4);
    emit_byte(0x04); /* add +4 to esp */
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmovl_ri, (FW r, IMMS i))

LOWFUNC(NONE, READ, 2, raw_fmovs_ri, (FW r, IMM i))
{
    emit_byte(0x68);
    emit_long(i);    /* push immediate32 onto [esp] */
    emit_byte(0xd9);
    emit_byte(0x04);
    emit_byte(0x24); /* fld load m32real from [esp] */
    emit_byte(0x83);
    emit_byte(0xc4);
    emit_byte(0x04); /* add +4 to esp */
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmovs_ri, (FW r, IMM i))

LOWFUNC(NONE, READ, 3, raw_fmov_ri, (FW r, IMM i1, IMM i2))
{
    emit_byte(0x68);
    emit_long(i2);   /* push immediate32 onto [esp] */
    emit_byte(0x68);
    emit_long(i1);   /* push immediate32 onto [esp] */
    emit_byte(0xdd);
    emit_byte(0x04);
    emit_byte(0x24); /* fld load m64real from [esp] */
    emit_byte(0x83);
    emit_byte(0xc4);
    emit_byte(0x08); /* add +8 to esp */
    tos_make(r);
}
LENDFUNC(NONE, READ, 3, raw_fmov_ri, (FW r, IMM i1, IMM i2))

LOWFUNC(NONE, READ, 4, raw_fmov_ext_ri, (FW r, IMM i1, IMM i2, IMM i3))
{
    emit_byte(0x68);
    emit_long(i3);   /* push immediate32 onto [esp] */
    emit_byte(0x68);
    emit_long(i2);   /* push immediate32 onto [esp] */
    emit_byte(0x68);
    emit_long(i1);   /* push immediate32 onto [esp] */
    emit_byte(0xdb);
    emit_byte(0x2c);
    emit_byte(0x24); /* fld load m80real from [esp] */
    emit_byte(0x83);
    emit_byte(0xc4);
    emit_byte(0x0c); /* add +12 to esp */
    tos_make(r);
}
LENDFUNC(NONE, READ, 4, raw_fmov_ext_ri, (FW r, IMM i1, IMM i2, IMMi3))

LOWFUNC(NONE, WRITE, 2, raw_fmov_ext_mr, (MEMW m, FR r))
{
    int rs;

    /* Stupid x87 can't write a long double to mem without popping the
       stack! */
    usereg(r);
    rs = stackpos(r);
    emit_byte(0xd9);     /* Get a copy to the top of stack */
    emit_byte(0xc0 + rs);

    emit_byte(0xdb);  /* store and pop it */
    emit_byte(0x3d);
    emit_long(m);
}
LENDFUNC(NONE, WRITE, 2, raw_fmov_ext_mr, (MEMW m, FR r))

LOWFUNC(NONE, WRITE, 2, raw_fmov_ext_mr_drop, (MEMW m, FR r))
{
    make_tos(r);
    emit_byte(0xdb);  /* store and pop it */
    emit_byte(0x3d);
    emit_long(m);
    live.onstack[live.tos] = -1;
    live.tos--;
    live.spos[r] = -2;
}
LENDFUNC(NONE, WRITE, 2, raw_fmov_ext_mr, (MEMW m, FR r))

LOWFUNC(NONE, READ, 2, raw_fmov_ext_rm, (FW r, MEMR m))
{
    emit_byte(0xdb);
    emit_byte(0x2d);
    emit_long(m);
    tos_make(r);
}
LENDFUNC(NONE, READ, 2, raw_fmov_ext_rm, (FW r, MEMR m))

LOWFUNC(NONE, NONE, 1, raw_fmov_pi, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xeb);
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_pi, (FW r))

LOWFUNC(NONE, NONE, 1, raw_fmov_log10_2, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xec);
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_log10_2, (FW r))

LOWFUNC(NONE, NONE, 1, raw_fmov_log2_e, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xea);
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_log2_e, (FW r))

LOWFUNC(NONE, NONE, 1, raw_fmov_loge_2, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xed);
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_loge_2, (FW r))

LOWFUNC(NONE, NONE, 1, raw_fmov_1, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xe8);
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_1, (FW r))

LOWFUNC(NONE, NONE, 1, raw_fmov_0, (FW r))
{
    emit_byte(0xd9);
    emit_byte(0xee);
    tos_make(r);
}
LENDFUNC(NONE, NONE, 1, raw_fmov_0, (FW r))

LOWFUNC(NONE, NONE, 2, raw_fmov_rr, (FW d, FR s))
{
    int ds;

    ds = stackpos(s);
    if (ds == 0 && live.spos[d] >= 0)
    {
        /* source is on top of stack, and we already have the dest */
        int dd = stackpos(d);
        emit_byte(0xdd);
        emit_byte(0xd0 + dd);
    }
    else
    {
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* duplicate source on tos */
        tos_make(d); /* store to destination, pop if necessary */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fmov_rr, (FW d, FR s))

LOWFUNC(NONE, READ, 2, raw_fldcw_m_indexed, (R4 index, IMM base))
{
    emit_byte(0xd9);
    emit_byte(0xa8 + index);
    emit_long(base);
}
LENDFUNC(NONE, READ, 2, raw_fldcw_m_indexed, (R4 index, IMM base))

LOWFUNC(NONE, NONE, 2, raw_fsqrt_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
        emit_byte(0xd9);
        emit_byte(0xfa); /* fsqrt sqrt(x) */
        tos_make(d);    /* store to destination */
    }
    else
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xfa); /* fsqrt y=sqrt(x) */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fsqrt_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fabs_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
        emit_byte(0xd9);
        emit_byte(0xe1); /* fabs abs(x) */
        tos_make(d);    /* store to destination */
    }
    else
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xe1); /* fabs y=abs(x) */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fabs_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_frndint_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
        emit_byte(0xd9);
        emit_byte(0xfc); /* frndint int(x) */
        tos_make(d);    /* store to destination */
    }
    else
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xfc); /* frndint y=int(x) */
    }
}
LENDFUNC(NONE, NONE, 2, raw_frndint_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fgetexp_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
        emit_byte(0xd9);
        emit_byte(0xf4); /* fxtract exp push man */
        emit_byte(0xdd);
        emit_byte(0xd8); /* fstp just pop man */
        tos_make(d);    /* store exp to destination */
    }
    else
    {
        make_tos(d);    /* tos=x=y */
        emit_byte(0xd9);
        emit_byte(0xf4); /* fxtract exp push man */
        emit_byte(0xdd);
        emit_byte(0xd8); /* fstp just pop man */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fgetexp_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fgetman_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
        emit_byte(0xd9);
        emit_byte(0xf4); /* fxtract exp push man */
        emit_byte(0xdd);
        emit_byte(0xd9); /* fstp copy man up & pop */
        tos_make(d);    /* store man to destination */
    }
    else
    {
        make_tos(d);    /* tos=x=y */
        emit_byte(0xd9);
        emit_byte(0xf4); /* fxtract exp push man */
        emit_byte(0xdd);
        emit_byte(0xd9); /* fstp copy man up & pop */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fgetman_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fsin_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
        emit_byte(0xd9);
        emit_byte(0xfe); /* fsin sin(x) */
        tos_make(d);    /* store to destination */
    }
    else
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xfe); /* fsin y=sin(x) */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fsin_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fcos_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
        emit_byte(0xd9);
        emit_byte(0xff); /* fcos cos(x) */
        tos_make(d);    /* store to destination */
    }
    else
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xff); /* fcos y=cos(x) */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fcos_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_ftan_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
        emit_byte(0xd9);
        emit_byte(0xf2); /* fptan tan(x)=y/1.0 */
        emit_byte(0xdd);
        emit_byte(0xd8); /* fstp pop 1.0 */
        tos_make(d);    /* store to destination */
    }
    else
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xf2); /* fptan tan(x)=y/1.0 */
        emit_byte(0xdd);
        emit_byte(0xd8); /* fstp pop 1.0 */
    }
}
LENDFUNC(NONE, NONE, 2, raw_ftan_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 3, raw_fsincos_rr, (FW d, FW c, FR s))
{
    int ds;

    if (s == d)
    {
        // Logger::Write (L"FSINCOS src = dest\n");
        make_tos(s);
        emit_byte(0xd9);
        emit_byte(0xfb); /* fsincos sin(x) push cos(x) */
        tos_make(c); /* store cos(x) to c */
        return;
    }

    ds = stackpos(s);
    emit_byte(0xd9);
    emit_byte(0xc0 + ds);  /* fld x */
    emit_byte(0xd9);
    emit_byte(0xfb);     /* fsincos sin(x) push cos(x) */
    if (live.spos[c] < 0)
    {
        if (live.spos[d] < 0) /* occupy both regs directly */
        {
            live.tos++;
            live.spos[d] = live.tos;
            live.onstack[live.tos] = d; /* sin(x) comes first */
            live.tos++;
            live.spos[c] = live.tos;
            live.onstack[live.tos] = c;
        }
        else
        {
            emit_byte(0xd9);
            emit_byte(0xc9); /* fxch swap cos(x) with sin(x) */
            emit_byte(0xdd); /* store sin(x) to d & pop */
            emit_byte(0xd8 + (live.tos + 2) - live.spos[d]);
            live.tos++;  /* occupy a reg for cos(x) here */
            live.spos[c] = live.tos;
            live.onstack[live.tos] = c;
        }
    }
    else
    {
        emit_byte(0xdd); /* store cos(x) to c & pop */
        emit_byte(0xd8 + (live.tos + 2) - live.spos[c]);
        tos_make(d); /* store sin(x) to destination */
    }
}
LENDFUNC(NONE, NONE, 3, raw_fsincos_rr, (FW d, FW c, FR s))

float one = 1;

LOWFUNC(NONE, NONE, 2, raw_fscale_rr, (FRW d, FR s))
{
    int ds;

    if (live.spos[d] == live.tos && live.spos[s] == live.tos - 1)
    {
        // Logger::Write (L"fscale found x in TOS-1 and y in TOS\n");
        emit_byte(0xd9);
        emit_byte(0xfd); /* fscale y*(2^x) */
    }
    else
    {
        make_tos(s);    /* tos=x */
        ds = stackpos(d);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld y */
        emit_byte(0xd9);
        emit_byte(0xfd); /* fscale y*(2^x) */
        tos_make(d);    /* store y=y*(2^x) */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fscale_rr, (FRW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_ftwotox_rr, (FW d, FR s))
{
    int ds;

    ds = stackpos(s);
    emit_byte(0xd9);
    emit_byte(0xc0 + ds); /* fld x */
    emit_byte(0xd9);
    emit_byte(0xfc);    /* frndint int(x) */
    emit_byte(0xd9);
    emit_byte(0xc1 + ds); /* fld x again */
    emit_byte(0xd8);
    emit_byte(0xe1);    /* fsub frac(x) = x - int(x) */
    emit_byte(0xd9);
    emit_byte(0xf0);    /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one); /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);    /* fscale (2^frac(x))*2^int(x) */
    emit_byte(0xdd);
    emit_byte(0xd9);    /* fstp copy & pop */
    tos_make(d);        /* store y=2^x */
}
LENDFUNC(NONE, NONE, 2, raw_ftwotox_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fetox_rr, (FW d, FR s))
{
    int ds;

    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xea);    /* fldl2e log2(e) */
    emit_byte(0xd8);
    emit_byte(0xc9);    /* fmul x*log2(e) */
    emit_byte(0xdd);
    emit_byte(0xd1);    /* fst copy up */
    emit_byte(0xd9);
    emit_byte(0xfc);    /* frndint int(x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap top two elements */
    emit_byte(0xd8);
    emit_byte(0xe1);    /* fsub x*log2(e) - int(x*log2(e))  */
    emit_byte(0xd9);
    emit_byte(0xf0);    /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one);  /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);    /* fscale (2^frac(x))*2^int(x*log2(e)) */
    emit_byte(0xdd);
    emit_byte(0xd9);    /* fstp copy & pop */
    if (s != d)
        tos_make(d);  /* store y=e^x */
}
LENDFUNC(NONE, NONE, 2, raw_fetox_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fetoxM1_rr, (FW d, FR s))
{
    int ds;

    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xea);    /* fldl2e log2(e) */
    emit_byte(0xd8);
    emit_byte(0xc9);    /* fmul x*log2(e) */
    emit_byte(0xdd);
    emit_byte(0xd1);    /* fst copy up */
    emit_byte(0xd9);
    emit_byte(0xfc);    /* frndint int(x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap top two elements */
    emit_byte(0xd8);
    emit_byte(0xe1);    /* fsub x*log2(e) - int(x*log2(e))  */
    emit_byte(0xd9);
    emit_byte(0xf0);    /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd9);
    emit_byte(0xfd);    /* fscale ((2^frac(x))-1)*2^int(x*log2(e)) */
    emit_byte(0xdd);
    emit_byte(0xd9);    /* fstp copy & pop */
    if (s != d)
        tos_make(d);  /* store y=(e^x)-1 */
}
LENDFUNC(NONE, NONE, 2, raw_fetoxM1_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_ftentox_rr, (FW d, FR s))
{
    int ds;

    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xe9);    /* fldl2t log2(10) */
    emit_byte(0xd8);
    emit_byte(0xc9);    /* fmul x*log2(10) */
    emit_byte(0xdd);
    emit_byte(0xd1);    /* fst copy up */
    emit_byte(0xd9);
    emit_byte(0xfc);    /* frndint int(x*log2(10)) */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap top two elements */
    emit_byte(0xd8);
    emit_byte(0xe1);    /* fsub x*log2(10) - int(x*log2(10))  */
    emit_byte(0xd9);
    emit_byte(0xf0);    /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one);  /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);    /* fscale (2^frac(x))*2^int(x*log2(10)) */
    emit_byte(0xdd);
    emit_byte(0xd9);    /* fstp copy & pop */
    if (s != d)
        tos_make(d);  /* store y=10^x */
}
LENDFUNC(NONE, NONE, 2, raw_ftentox_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_flog2_rr, (FW d, FR s))
{
    int ds;

    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xe8);    /* fld1 1 */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap 1 with x */
    emit_byte(0xd9);
    emit_byte(0xf1);    /* fyl2x 1*log2(x) */
    if (s != d)
        tos_make(d);  /* store y=log2(x) */
}
LENDFUNC(NONE, NONE, 2, raw_flog2_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_flogN_rr, (FW d, FR s))
{
    int ds;

    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xed);    /* fldln2 logN(2) */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap logN(2) with x */
    emit_byte(0xd9);
    emit_byte(0xf1);    /* fyl2x logN(2)*log2(x) */
    if (s != d)
        tos_make(d);  /* store y=logN(x) */
}
LENDFUNC(NONE, NONE, 2, raw_flogN_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_flogNP1_rr, (FW d, FR s))
{
    int ds;

    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xed);    /* fldln2 logN(2) */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap logN(2) with x */
    emit_byte(0xd9);
    emit_byte(0xf9);    /* fyl2xp1 logN(2)*log2(x+1) */
    if (s != d)
        tos_make(d);  /* store y=logN(x+1) */
}
LENDFUNC(NONE, NONE, 2, raw_flogNP1_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_flog10_rr, (FW d, FR s))
{
    int ds;

    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xec);    /* fldlg2 log10(2) */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap log10(2) with x */
    emit_byte(0xd9);
    emit_byte(0xf1);    /* fyl2x log10(2)*log2(x) */
    if (s != d)
        tos_make(d);  /* store y=log10(x) */
}
LENDFUNC(NONE, NONE, 2, raw_flog10_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fasin_rr, (FW d, FR s))
{
    int ds;

    ds = stackpos(s);
    emit_byte(0xd9);
    emit_byte(0xc0 + ds); /* fld x */
    emit_byte(0xd8);
    emit_byte(0xc8);    /* fmul x*x */
    emit_byte(0xd9);
    emit_byte(0xe8);    /* fld 1.0 */
    emit_byte(0xde);
    emit_byte(0xe1);    /* fsubrp 1 - (x^2) */
    emit_byte(0xd9);
    emit_byte(0xfa);    /* fsqrt sqrt(1-(x^2)) */
    emit_byte(0xd9);
    emit_byte(0xc1 + ds); /* fld x again */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap x with sqrt(1-(x^2))  */
    emit_byte(0xd9);
    emit_byte(0xf3);    /* fpatan atan(x/sqrt(1-(x^2))) & pop */
    tos_make(d);        /* store y=asin(x) */
}
LENDFUNC(NONE, NONE, 2, raw_fasin_rr, (FW d, FR s))

static uint pihalf[] = { 0x2168c234, 0xc90fdaa2, 0x3fff }; // LSB=0 to get acos(1)=0
LOWFUNC(NONE, NONE, 2, raw_facos_rr, (FW d, FR s))
{
    int ds;

    ds = stackpos(s);
    emit_byte(0xd9);
    emit_byte(0xc0 + ds); /* fld x */
    emit_byte(0xd8);
    emit_byte(0xc8);    /* fmul x*x */
    emit_byte(0xd9);
    emit_byte(0xe8);    /* fld 1.0 */
    emit_byte(0xde);
    emit_byte(0xe1);    /* fsubrp 1 - (x^2) */
    emit_byte(0xd9);
    emit_byte(0xfa);    /* fsqrt sqrt(1-(x^2)) */
    emit_byte(0xd9);
    emit_byte(0xc1 + ds); /* fld x again */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap x with sqrt(1-(x^2))  */
    emit_byte(0xd9);
    emit_byte(0xf3);    /* fpatan atan(x/sqrt(1-(x^2))) & pop */
    emit_byte(0xdb);
    emit_byte(0x2d);
    emit_long((uint) & pihalf); /* fld load pi/2 from pihalf */
    emit_byte(0xde);
    emit_byte(0xe1);    /* fsubrp pi/2 - asin(x) & pop */
    tos_make(d);        /* store y=acos(x) */
}
LENDFUNC(NONE, NONE, 2, raw_facos_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fatan_rr, (FW d, FR s))
{
    int ds;

    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xe8);    /* fld 1.0 */
    emit_byte(0xd9);
    emit_byte(0xf3);    /* fpatan atan(x)/1  & pop*/
    if (s != d)
        tos_make(d);  /* store y=atan(x) */
}
LENDFUNC(NONE, NONE, 2, raw_fatan_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fatanh_rr, (FW d, FR s))
{
    int ds;

    ds = stackpos(s);
    emit_byte(0xd9);
    emit_byte(0xc0 + ds); /* fld x */
    emit_byte(0xd9);
    emit_byte(0xe8);    /* fld 1.0 */
    emit_byte(0xdc);
    emit_byte(0xc1);    /* fadd 1 + x */
    emit_byte(0xd8);
    emit_byte(0xe2 + ds); /* fsub 1 - x */
    emit_byte(0xde);
    emit_byte(0xf9);    /* fdivp (1+x)/(1-x) */
    emit_byte(0xd9);
    emit_byte(0xed);    /* fldl2e logN(2) */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap logN(2) with (1+x)/(1-x) */
    emit_byte(0xd9);
    emit_byte(0xf1);    /* fyl2x logN(2)*log2((1+x)/(1-x)) pop */
    emit_byte(0xd9);
    emit_byte(0xe8);    /* fld 1.0 */
    emit_byte(0xd9);
    emit_byte(0xe0);    /* fchs -1.0 */
    emit_byte(0xd9);
    emit_byte(0xc9);    /* fxch swap */
    emit_byte(0xd9);
    emit_byte(0xfd);    /* fscale logN((1+x)/(1-x)) * 2^(-1) */
    emit_byte(0xdd);
    emit_byte(0xd9);    /* fstp copy & pop */
    tos_make(d);        /* store y=atanh(x) */
}
LENDFUNC(NONE, NONE, 2, raw_fatanh_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fsinh_rr, (FW d, FR s))
{
    int ds, tr;

    tr = live.onstack[live.tos + 3];
    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xea);     /* fldl2e log2(e) */
    emit_byte(0xd8);
    emit_byte(0xc9);     /* fmul x*log2(e) */
    emit_byte(0xdd);
    emit_byte(0xd1);     /* fst copy x*log2(e) */
    if (tr >= 0)
    {
        emit_byte(0xd9);
        emit_byte(0xca); /* fxch swap with temp-reg */
        emit_byte(0x83);
        emit_byte(0xc4);
        emit_byte(0xf4); /* add -12 to esp */
        emit_byte(0xdb);
        emit_byte(0x3c);
        emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */
    }
    emit_byte(0xd9);
    emit_byte(0xe0);     /* fchs -x*log2(e) */
    emit_byte(0xd9);
    emit_byte(0xc0);     /* fld -x*log2(e) again */
    emit_byte(0xd9);
    emit_byte(0xfc);     /* frndint int(-x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xc9);     /* fxch swap */
    emit_byte(0xd8);
    emit_byte(0xe1);     /* fsub -x*log2(e) - int(-x*log2(e))  */
    emit_byte(0xd9);
    emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one);  /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xca);     /* fxch swap e^-x with x*log2(e) in tr */
    emit_byte(0xdd);
    emit_byte(0xd1);     /* fst copy x*log2(e) */
    emit_byte(0xd9);
    emit_byte(0xfc);     /* frndint int(x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xc9);     /* fxch swap */
    emit_byte(0xd8);
    emit_byte(0xe1);     /* fsub x*log2(e) - int(x*log2(e))  */
    emit_byte(0xd9);
    emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one);  /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
    emit_byte(0xdd);
    emit_byte(0xd9);     /* fstp copy e^x & pop */
    if (tr >= 0)
    {
        emit_byte(0xdb);
        emit_byte(0x2c);
        emit_byte(0x24); /* fld load temp-reg from [esp] */
        emit_byte(0x83);
        emit_byte(0xc4);
        emit_byte(0x0c); /* add +12 to esp */
        emit_byte(0xd9);
        emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */
        emit_byte(0xde);
        emit_byte(0xe9); /* fsubp (e^x)-(e^-x) */
    }
    else
    {
        emit_byte(0xde);
        emit_byte(0xe1); /* fsubrp (e^x)-(e^-x) */
    }
    emit_byte(0xd9);
    emit_byte(0xe8);     /* fld 1.0 */
    emit_byte(0xd9);
    emit_byte(0xe0);     /* fchs -1.0 */
    emit_byte(0xd9);
    emit_byte(0xc9);     /* fxch swap */
    emit_byte(0xd9);
    emit_byte(0xfd);     /* fscale ((e^x)-(e^-x))/2 */
    emit_byte(0xdd);
    emit_byte(0xd9);     /* fstp copy & pop */
    if (s != d)
        tos_make(d);  /* store y=sinh(x) */
}
LENDFUNC(NONE, NONE, 2, raw_fsinh_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fcosh_rr, (FW d, FR s))
{
    int ds, tr;

    tr = live.onstack[live.tos + 3];
    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xea);     /* fldl2e log2(e) */
    emit_byte(0xd8);
    emit_byte(0xc9);     /* fmul x*log2(e) */
    emit_byte(0xdd);
    emit_byte(0xd1);     /* fst copy x*log2(e) */
    if (tr >= 0)
    {
        emit_byte(0xd9);
        emit_byte(0xca); /* fxch swap with temp-reg */
        emit_byte(0x83);
        emit_byte(0xc4);
        emit_byte(0xf4); /* add -12 to esp */
        emit_byte(0xdb);
        emit_byte(0x3c);
        emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */
    }
    emit_byte(0xd9);
    emit_byte(0xe0);     /* fchs -x*log2(e) */
    emit_byte(0xd9);
    emit_byte(0xc0);     /* fld -x*log2(e) again */
    emit_byte(0xd9);
    emit_byte(0xfc);     /* frndint int(-x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xc9);     /* fxch swap */
    emit_byte(0xd8);
    emit_byte(0xe1);     /* fsub -x*log2(e) - int(-x*log2(e))  */
    emit_byte(0xd9);
    emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one);  /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xca);     /* fxch swap e^-x with x*log2(e) in tr */
    emit_byte(0xdd);
    emit_byte(0xd1);     /* fst copy x*log2(e) */
    emit_byte(0xd9);
    emit_byte(0xfc);     /* frndint int(x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xc9);     /* fxch swap */
    emit_byte(0xd8);
    emit_byte(0xe1);     /* fsub x*log2(e) - int(x*log2(e))  */
    emit_byte(0xd9);
    emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one);  /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
    emit_byte(0xdd);
    emit_byte(0xd9);     /* fstp copy e^x & pop */
    if (tr >= 0)
    {
        emit_byte(0xdb);
        emit_byte(0x2c);
        emit_byte(0x24); /* fld load temp-reg from [esp] */
        emit_byte(0x83);
        emit_byte(0xc4);
        emit_byte(0x0c); /* add +12 to esp */
        emit_byte(0xd9);
        emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */
    }
    emit_byte(0xde);
    emit_byte(0xc1);     /* faddp (e^x)+(e^-x) */
    emit_byte(0xd9);
    emit_byte(0xe8);     /* fld 1.0 */
    emit_byte(0xd9);
    emit_byte(0xe0);     /* fchs -1.0 */
    emit_byte(0xd9);
    emit_byte(0xc9);     /* fxch swap */
    emit_byte(0xd9);
    emit_byte(0xfd);     /* fscale ((e^x)+(e^-x))/2 */
    emit_byte(0xdd);
    emit_byte(0xd9);     /* fstp copy & pop */
    if (s != d)
        tos_make(d);  /* store y=cosh(x) */
}
LENDFUNC(NONE, NONE, 2, raw_fcosh_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_ftanh_rr, (FW d, FR s))
{
    int ds, tr;

    tr = live.onstack[live.tos + 3];
    if (s == d)
        make_tos(s);
    else
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld x */
    }
    emit_byte(0xd9);
    emit_byte(0xea);     /* fldl2e log2(e) */
    emit_byte(0xd8);
    emit_byte(0xc9);     /* fmul x*log2(e) */
    emit_byte(0xdd);
    emit_byte(0xd1);     /* fst copy x*log2(e) */
    if (tr >= 0)
    {
        emit_byte(0xd9);
        emit_byte(0xca); /* fxch swap with temp-reg */
        emit_byte(0x83);
        emit_byte(0xc4);
        emit_byte(0xf4); /* add -12 to esp */
        emit_byte(0xdb);
        emit_byte(0x3c);
        emit_byte(0x24); /* fstp store temp-reg to [esp] & pop */
    }
    emit_byte(0xd9);
    emit_byte(0xe0);     /* fchs -x*log2(e) */
    emit_byte(0xd9);
    emit_byte(0xc0);     /* fld -x*log2(e) again */
    emit_byte(0xd9);
    emit_byte(0xfc);     /* frndint int(-x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xc9);     /* fxch swap */
    emit_byte(0xd8);
    emit_byte(0xe1);     /* fsub -x*log2(e) - int(-x*log2(e))  */
    emit_byte(0xd9);
    emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one);  /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xca);     /* fxch swap e^-x with x*log2(e) */
    emit_byte(0xdd);
    emit_byte(0xd1);     /* fst copy x*log2(e) */
    emit_byte(0xd9);
    emit_byte(0xfc);     /* frndint int(x*log2(e)) */
    emit_byte(0xd9);
    emit_byte(0xc9);     /* fxch swap */
    emit_byte(0xd8);
    emit_byte(0xe1);     /* fsub x*log2(e) - int(x*log2(e))  */
    emit_byte(0xd9);
    emit_byte(0xf0);     /* f2xm1 (2^frac(x))-1 */
    emit_byte(0xd8);
    emit_byte(0x05);
    emit_long((uint) & one);  /* fadd (2^frac(x))-1 + 1 */
    emit_byte(0xd9);
    emit_byte(0xfd);     /* fscale (2^frac(x))*2^int(x*log2(e)) */
    emit_byte(0xdd);
    emit_byte(0xd1);     /* fst copy e^x */
    emit_byte(0xd8);
    emit_byte(0xc2);     /* fadd (e^x)+(e^-x) */
    emit_byte(0xd9);
    emit_byte(0xca);     /* fxch swap with e^-x */
    emit_byte(0xde);
    emit_byte(0xe9);     /* fsubp (e^x)-(e^-x) */
    if (tr >= 0)
    {
        emit_byte(0xdb);
        emit_byte(0x2c);
        emit_byte(0x24); /* fld load temp-reg from [esp] */
        emit_byte(0x83);
        emit_byte(0xc4);
        emit_byte(0x0c); /* add +12 to esp */
        emit_byte(0xd9);
        emit_byte(0xca); /* fxch swap temp-reg with e^-x in tr */
        emit_byte(0xde);
        emit_byte(0xf9); /* fdivp ((e^x)-(e^-x))/((e^x)+(e^-x)) */
    }
    else
    {
        emit_byte(0xde);
        emit_byte(0xf1); /* fdivrp ((e^x)-(e^-x))/((e^x)+(e^-x)) */
    }
    if (s != d)
        tos_make(d);  /* store y=tanh(x) */
}
LENDFUNC(NONE, NONE, 2, raw_ftanh_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fneg_rr, (FW d, FR s))
{
    int ds;

    if (d != s)
    {
        ds = stackpos(s);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* duplicate source */
        emit_byte(0xd9);
        emit_byte(0xe0); /* take fchs */
        tos_make(d); /* store to destination */
    }
    else
    {
        make_tos(d);
        emit_byte(0xd9);
        emit_byte(0xe0); /* take fchs */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fneg_rr, (FW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fadd_rr, (FRW d, FR s))
{
    int ds;

    if (live.spos[s] == live.tos)
    {
        /* Source is on top of stack */
        ds = stackpos(d);
        emit_byte(0xdc);
        emit_byte(0xc0 + ds); /* add source to dest*/
    }
    else
    {
        make_tos(d);
        ds = stackpos(s);

        emit_byte(0xd8);
        emit_byte(0xc0 + ds); /* add source to dest*/
    }
}
LENDFUNC(NONE, NONE, 2, raw_fadd_rr, (FRW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fsub_rr, (FRW d, FR s))
{
    int ds;

    if (live.spos[s] == live.tos)
    {
        /* Source is on top of stack */
        ds = stackpos(d);
        emit_byte(0xdc);
        emit_byte(0xe8 + ds); /* sub source from dest*/
    }
    else
    {
        make_tos(d);
        ds = stackpos(s);

        emit_byte(0xd8);
        emit_byte(0xe0 + ds); /* sub src from dest */
    }
}
LENDFUNC(NONE, NONE, 2, raw_fsub_rr, (FRW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fcmp_rr, (FR d, FR s))
{
    int ds;

    make_tos(d);
    ds = stackpos(s);

    emit_byte(0xdd);
    emit_byte(0xe0 + ds); /* cmp dest with source*/
}
LENDFUNC(NONE, NONE, 2, raw_fcmp_rr, (FR d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fmul_rr, (FRW d, FR s))
{
    int ds;

    if (live.spos[s] == live.tos)
    {
        /* Source is on top of stack */
        ds = stackpos(d);
        emit_byte(0xdc);
        emit_byte(0xc8 + ds); /* mul dest by source*/
    }
    else
    {
        make_tos(d);
        ds = stackpos(s);

        emit_byte(0xd8);
        emit_byte(0xc8 + ds); /* mul dest by source*/
    }
}
LENDFUNC(NONE, NONE, 2, raw_fmul_rr, (FRW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_fdiv_rr, (FRW d, FR s))
{
    int ds;

    if (live.spos[s] == live.tos)
    {
        /* Source is on top of stack */
        ds = stackpos(d);
        emit_byte(0xdc);
        emit_byte(0xf8 + ds); /* div dest by source */
    }
    else
    {
        make_tos(d);
        ds = stackpos(s);

        emit_byte(0xd8);
        emit_byte(0xf0 + ds); /* div dest by source*/
    }
}
LENDFUNC(NONE, NONE, 2, raw_fdiv_rr, (FRW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_frem_rr, (FRW d, FR s))
{
    int ds;

    if (live.spos[d] == live.tos && live.spos[s] == live.tos - 1)
    {
        // Logger::Write (L"frem found x in TOS-1 and y in TOS\n");
        emit_byte(0xd9);
        emit_byte(0xf8); /* fprem rem(y/x) */
    }
    else
    {
        make_tos(s);    /* tos=x */
        ds = stackpos(d);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld y */
        emit_byte(0xd9);
        emit_byte(0xf8); /* fprem rem(y/x) */
        tos_make(d);    /* store y=rem(y/x) */
    }
}
LENDFUNC(NONE, NONE, 2, raw_frem_rr, (FRW d, FR s))

LOWFUNC(NONE, NONE, 2, raw_frem1_rr, (FRW d, FR s))
{
    int ds;

    if (live.spos[d] == live.tos && live.spos[s] == live.tos - 1)
    {
        // Logger::Write (L"frem1 found x in TOS-1 and y in TOS\n");
        emit_byte(0xd9);
        emit_byte(0xf5); /* fprem1 rem1(y/x) */
    }
    else
    {
        make_tos(s);    /* tos=x */
        ds = stackpos(d);
        emit_byte(0xd9);
        emit_byte(0xc0 + ds); /* fld y */
        emit_byte(0xd9);
        emit_byte(0xf5); /* fprem1 rem1(y/x) */
        tos_make(d);    /* store y=rem(y/x) */
    }
}
LENDFUNC(NONE, NONE, 2, raw_frem1_rr, (FRW d, FR s))

LOWFUNC(NONE, NONE, 1, raw_ftst_r, (FR r))
{
    make_tos(r);
    emit_byte(0xd9);  /* ftst */
    emit_byte(0xe4);
}
LENDFUNC(NONE, NONE, 1, raw_ftst_r, (FR r))

static __forceinline void raw_fflags_into_flags(int r)
{
    int p;

    usereg(r);
    p = stackpos(r);

    emit_byte(0xd9);
    emit_byte(0xee); /* Push 0 */
    emit_byte(0xd9);
    emit_byte(0xc9 + p); /* swap top two around */
    if (have_cmov)
    {
        //  gb-- fucomi is for P6 cores only, not K6-2 then...
        emit_byte(0xdb);
        emit_byte(0xe9 + p); /* fucomi them */
    }
    else
    {
        emit_byte(0xdd);
        emit_byte(0xe1 + p); /* fucom them */
        emit_byte(0x9b);
        emit_byte(0xdf);
        emit_byte(0xe0); /* fstsw ax */
        raw_sahf(0); /* sahf */
    }
    emit_byte(0xdd);
    emit_byte(0xd9 + p);  /* store value back, and get rid of 0 */
}